Esempio n. 1
0
def test_clodius_aggregate_bigwig():
    """Aggregate a bigWig file through the CLI and check the resulting tiles.

    Writes a single-chromosome chromsizes file, runs ``cca.bigwig`` on the
    sample bigWig and then reads tile 0 at the three highest zoom levels of
    the output file, asserting that the inspected entries all equal 1.
    """
    import traceback

    import clodius.hdf_tiles as ch

    runner = clt.CliRunner()
    input_file = op.join(testdir, 'sample_data', 'test.tile_generation.bw')
    print("input_file:", input_file)

    with open('/tmp/test_chrs.tsv', 'w') as f:
        f.write('{}\t{}'.format('test', 100000))

    result = runner.invoke(cca.bigwig, [
        input_file, '--chromsizes-filename', '/tmp/test_chrs.tsv',
        '--output-file', '/tmp/test.mr.bw'
    ])

    # Diagnostics: only shown by pytest when the test fails.
    print("exc_info:", result.exc_info)
    print("result:", result)
    print("result.output", result.output)
    if result.exc_info is not None:
        a, b, tb = result.exc_info
        # format_tb returns the text; print_tb would return None
        print("result.error", ''.join(traceback.format_tb(tb)))
        print("Exception:", a, b)

    # Fail on the CLI error itself rather than on a missing/partial output.
    assert result.exit_code == 0

    filename = '/tmp/test.mr.bw'
    # Read-only and closed deterministically when the block ends.
    with h5py.File(filename, 'r') as f:
        max_zoom = f['meta'].attrs['max-zoom']
        tile_size = int(f['meta'].attrs['tile-size'])

        d = ch.get_data(f, max_zoom, 0)
        print("d:", d)

        # lowest zoom should have values of 1
        for i in range(tile_size):
            assert d[i] == 1

        d = ch.get_data(f, max_zoom - 1, 0)
        print("d:", d)

        for i in range(tile_size):
            # because we're taking averages
            assert d[i] == 1

        d = ch.get_data(f, max_zoom - 2, 0)

        for i in range(tile_size // 2):
            assert d[i] == 1

        print("hey")
        assert d[513] == 1
Esempio n. 2
0
def main():
    """Print size information for a hitile file and fetch one tile.

    Command line: ``python get_hitile.py filename z x`` where ``z`` and ``x``
    are integers that are handed to ``hdft.get_data`` unchanged.
    """
    arg_parser = argparse.ArgumentParser(description="""

    python get_hitile.py filename z x
""")

    positional_specs = (
        ('filename', {}),
        ('z', {'type': int}),
        ('x', {'type': int}),
    )
    for arg_name, arg_options in positional_specs:
        arg_parser.add_argument(arg_name, **arg_options)

    cli_args = arg_parser.parse_args()

    with h5py.File(cli_args.filename, 'r') as hitile:
        info = hdft.get_tileset_info(hitile)
        max_width = info['max_width']
        max_pos = info['max_pos']
        tile_size = info['tile_size']

        print("max_width", max_width)
        print("max_pos", max_pos)

        # Scale the tile size by the ratio max_pos / max_width.
        covered_fraction = max_pos / max_width
        last_index = int(tile_size * covered_fraction)
        print("last_index:", last_index)

        # The tile is fetched but not used further in this function.
        tile_data = hdft.get_data(hitile, cli_args.z, cli_args.x)
Esempio n. 3
0
def main():
    """Read tiles from a hitile (HDF5) file and report what was read.

    Command line: ``python read.py hdf_file [-z Z -x X] [-n NUM_TRIALS]``.

    When both ``-z`` and ``-x`` are given, that single tile is fetched and
    printed. Otherwise ``--num-trials`` random tiles are fetched and the
    average time per tile is printed.

    Raises:
        SystemExit: with status 1 when random trials are requested but
            ``--num-trials`` is smaller than 1.
    """
    parser = argparse.ArgumentParser(description="""

    python read.py hdf_file
""")

    parser.add_argument("filepath")
    parser.add_argument("-z", default=None, type=int)
    parser.add_argument("-x", default=None, type=int)

    parser.add_argument("-n", "--num-trials", default=1, type=int)

    args = parser.parse_args()

    explicit_tile = args.x is not None and args.z is not None

    # The trial count only matters for the random benchmark. Stop here
    # instead of running into a division by zero when averaging below.
    if not explicit_tile and args.num_trials < 1:
        print("The number of trials needs to be greater than 0",
              file=sys.stderr)
        sys.exit(1)

    with h5py.File(args.filepath, "r") as f:
        if explicit_tile:
            d = ch.get_data(f, args.z, args.x)
            print("z:", args.z, "x:", args.x, "len:", len(d), d)
            return

        t1 = time.time()

        # Loop-invariant: read the zoom limit once.
        max_zoom = int(f["meta"].attrs["max-zoom"])

        for _ in range(args.num_trials):
            z = random.randint(0, max_zoom)
            # randrange excludes the upper bound, so x stays within
            # 0 .. 2**z - 1 (randint would also pick 2**z).
            x = random.randrange(2 ** z)

            d = ch.get_data(f, z, x)
            print("z:", z, "x:", x, "len:", len(d), d)

        t2 = time.time()

    print("avg time:", (t2 - t1) / args.num_trials)
    """
def generate_hitile_tiles(tileset, tile_ids):
    '''
    Generate tiles from a hitile file.

    Each tile is returned as a base64 string of the dense values. The values
    are encoded as float16 when the tile contains no NaN and every value lies
    strictly inside the float16 range, and as float32 otherwise.

    Parameters
    ----------
    tileset: tilesets.models.Tileset object
        The tileset that the tile ids should be retrieved from
    tile_ids: [str,...]
        A list of tile_ids (e.g. <uuid>.<z>.<x>) identifying the tiles
        to be retrieved

    Returns
    -------
    tile_list: [(tile_id, tile_data),...]
        A list of tile_id, tile_data tuples
    '''
    tile_ids = list(tile_ids)
    if not tile_ids:
        # Nothing requested: do not touch the data file at all.
        return []

    min_f16 = np.finfo('float16').min
    max_f16 = np.finfo('float16').max

    generated_tiles = []

    # Open the data file once for all tiles, read-only, and close it on exit
    # (previously a new handle was opened per tile and never closed).
    with h5py.File(tileset.datafile.path, 'r') as hitile:
        for tile_id in tile_ids:
            tile_id_parts = tile_id.split('.')
            tile_position = list(map(int, tile_id_parts[1:3]))

            dense = hdft.get_data(hitile, tile_position[0], tile_position[1])

            has_nan = bool(np.isnan(dense).any())

            if len(dense):
                max_dense = dense.max()
                min_dense = dense.min()
            else:
                max_dense = 0
                min_dense = 0

            fits_f16 = (not has_nan
                        and min_dense > min_f16
                        and max_dense < max_f16)
            dtype = 'float16' if fits_f16 else 'float32'

            tile_value = {
                'dense':
                base64.b64encode(dense.astype(dtype)).decode('utf-8'),
                'dtype': dtype
            }

            generated_tiles.append((tile_id, tile_value))

    return generated_tiles
Esempio n. 5
0
def check_1d_file(filename):
    """Assert that a 1D hitile file contains the expected aggregated values.

    Tile 0 is read at the three highest zoom levels. Every entry must be 1
    at the maximum zoom and 2 one level above. Two levels above, the first
    half of the tile and entry 513 must be 4.

    Args:
        filename: Path of the hitile (HDF5) file to check.

    Raises:
        AssertionError: if any inspected value differs from the expectation.
    """
    # Read-only access; the handle is closed when the block ends.
    with h5py.File(filename, 'r') as f:
        max_zoom = f['meta'].attrs['max-zoom']
        tile_size = int(f['meta'].attrs['tile-size'])

        d = ch.get_data(f, max_zoom, 0)

        # lowest zoom should have values of 1
        for i in range(tile_size):
            assert d[i] == 1

        d = ch.get_data(f, max_zoom - 1, 0)

        for i in range(tile_size):
            assert d[i] == 2

        d = ch.get_data(f, max_zoom - 2, 0)

        for i in range(tile_size // 2):
            assert d[i] == 4

        assert d[513] == 4
Esempio n. 6
0
def test_clodius_aggregate_bedgraph():
    """Aggregate the CNV sample bedgraph twice and sanity-check the output.

    The first run uses the small test chromsizes file, the second the
    ``grch37`` assembly. Both write to the same output file. After the second
    run, tile 0 at zoom 0 must start with a non-NaN value and end with NaN.
    """
    input_file = op.join(testdir, 'sample_data', 'cnvs_hw.tsv')
    assembly_file = op.join(testdir, 'sample_data', 'test_cnvs_assembly')
    output_file = '/tmp/cnvs_hw.hitile'

    # Column layout options shared by both invocations.
    column_args = [
        '--chromosome-col', '2',
        '--from-pos-col', '3',
        '--to-pos-col', '4',
        '--value-col', '5',
        '--has-header',
        '--nan-value', 'NA',
    ]

    # run once to make sure it doesn't crash on a smaller genome
    # NOTE(review): the result of this run is never asserted. CliRunner
    # captures exceptions, so a crash here would go unnoticed. Confirm
    # whether an exit-code check is wanted.
    runner = clt.CliRunner()
    result = runner.invoke(
        cca.bedgraph,
        [input_file, '--output-file', output_file,
         '--chromsizes-filename', assembly_file] + column_args)

    # run again with the proper assembly
    runner = clt.CliRunner()
    result = runner.invoke(
        cca.bedgraph,
        [input_file, '--output-file', output_file,
         '--assembly', 'grch37'] + column_args)

    assert result.exit_code == 0

    # Read-only access; the handle is closed when the block ends.
    with h5py.File(output_file, 'r') as f:
        d = cht.get_data(f, 0, 0)

        assert not np.isnan(d[0])
        assert np.isnan(d[-1])

        cht.get_data(f, 3, 0)
        # TODO: Make assertions about result
Esempio n. 7
0
def test_clodius_aggregate_bedgraph1():
    """Aggregate the dm3 sample bedgraph and check individual stored values.

    Runs ``cca.bedgraph`` with the ``dm3`` assembly, then checks entries of
    the ``values_0`` dataset near the start of the genome and near the start
    of chr2R, and finally that the NaN-ignoring sum of tile 0 at zoom 0 lies
    strictly between 1 and 10.
    """
    import numpy as np

    input_file = op.join(testdir, 'sample_data', 'dm3_values.tsv')
    output_file = '/tmp/dm3_values.hitile'

    runner = clt.CliRunner()
    result = runner.invoke(
        cca.bedgraph,
        [input_file, '--output-file', output_file, '--assembly', 'dm3'])

    # Fail on the CLI error itself rather than on a missing/partial output.
    assert result.exit_code == 0

    # Read-only access; the handle is closed when the block ends.
    with h5py.File(output_file, 'r') as f:
        # TODO: Make assertions about result
        values = f['values_0']

        # genome positions are 0 based as stored in hitile files
        assert np.isnan(values[8])
        assert values[9] == 1
        assert values[10] == 1
        assert values[13] == 1
        assert np.isnan(values[14])
        assert np.isnan(values[15])

        chrom_info = nc.get_chrominfo('dm3')
        chr_2r_pos = nc.chr_pos_to_genome_pos('chr2R', 0, chrom_info)

        assert np.isnan(values[chr_2r_pos + 28])
        assert values[chr_2r_pos + 29] == 77
        assert values[chr_2r_pos + 38] == 77
        assert values[chr_2r_pos + 39] == 0

        d = cht.get_data(f, 0, 0)
        total = np.nansum(d)
        assert 1.0 < total < 10.0

    return

    # NOTE(review): everything below is unreachable because of the `return`
    # above. It looks like a disabled check against a 'test3chroms' assembly;
    # confirm whether it should be re-enabled or removed.
    input_file = op.join(testdir, 'sample_data', 'test3chroms_values.tsv')
    output_file = '/tmp/test3chroms_values.hitile'

    runner = clt.CliRunner()
    result = runner.invoke(cca.bedgraph, [
        input_file, '--output-file', output_file, '--assembly', 'test3chroms'
    ])

    f = h5py.File('/tmp/test3chroms_values.hitile')
    # TODO: Make assertions about result

    values = f['values_0']

    # genome positions are 0 based as stored in hitile files
    assert (values[8] == 0)
    assert (values[9] == 1)
    assert (values[10] == 1)
    assert (values[13] == 1)
    assert (values[14] == 0)
    assert (values[15] == 0)

    chr2_pos = nc.chr_pos_to_genome_pos('chr2', 0, 'test3chroms')

    assert (values[chr2_pos + 28] == 0)
    assert (values[chr2_pos + 29] == 77)
    assert (values[chr2_pos + 38] == 77)
    assert (values[chr2_pos + 39] == 0)

    assert (result.exit_code == 0)

    d = cht.get_data(f, 0, 0)
    assert (sum(d) == 770 + 880 + 5)
Esempio n. 8
0
def test_clodius_aggregate_bedgraph():
    """Aggregate the CNV sample bedgraph twice and sanity-check the output.

    The first invocation uses the small test chromsizes file, the second the
    ``grch37`` assembly. Both write to the same output file. After the second
    run, tile 0 at zoom 0 must start with a non-NaN value and end with NaN.
    """
    input_file = op.join(testdir, "sample_data", "cnvs_hw.tsv")
    assembly_file = op.join(testdir, "sample_data", "test_cnvs_assembly")
    output_file = "/tmp/cnvs_hw.hitile"

    def build_args(genome_args):
        """Return the CLI argument list for one genome definition."""
        return [
            input_file,
            "--output-file",
            output_file,
            *genome_args,
            "--chromosome-col",
            "2",
            "--from-pos-col",
            "3",
            "--to-pos-col",
            "4",
            "--value-col",
            "5",
            "--has-header",
            "--nan-value",
            "NA",
        ]

    # run once to make sure it doesn't crash on a smaller genome
    # NOTE(review): the result of this run is never asserted. CliRunner
    # captures exceptions, so a crash here would go unnoticed. Confirm
    # whether an exit-code check is wanted.
    runner = clt.CliRunner()
    result = runner.invoke(
        cca.bedgraph, build_args(["--chromsizes-filename", assembly_file])
    )

    # run again with the proper assembly
    runner = clt.CliRunner()
    result = runner.invoke(cca.bedgraph, build_args(["--assembly", "grch37"]))

    assert result.exit_code == 0

    # The handle is closed when the block ends instead of being leaked.
    with h5py.File(output_file, "r") as f:
        d = cht.get_data(f, 0, 0)

        assert not np.isnan(d[0])
        assert np.isnan(d[-1])

        cht.get_data(f, 3, 0)
        # TODO: Make assertions about result
Esempio n. 9
0
import clodius.hdf_tiles as cht
import h5py
import sys

# Debug script: fetch tiles from a local hitile file.
# The file is opened read-only and closed when the block is left, including
# when the block is left through sys.exit.
with h5py.File('test_chr14.hitile', 'r') as f:
    tile = cht.get_data(f, 12, 3316)

    # NOTE(review): the script stops here with exit status 1, so the loop
    # below never runs. This looks like a debugging leftover; confirm before
    # removing it.
    sys.exit(1)

    max_zoom = 17
    pos = 117440512 - 30
    for i in range(max_zoom):
        # Floor division keeps the tile index an integer; `/` yields a float
        # on Python 3.
        tile_pos = pos // (1024 * 2**(max_zoom - i))
        tile = cht.get_data(f, i, tile_pos)
        print("z", i, "tile_pos:", tile_pos, "data", tile)
Esempio n. 10
0
def generate_tile(tile_id, request):
    '''
    Create a tile. The tile_id specifies the dataset as well
    as the position.

    This function will look at the filetype and determine what type
    of tile to retrieve (e.g. cooler -> 2D dense, hitile -> 1D dense,
    elasticsearch -> anything). Tiles are looked up in the ``rdb`` cache
    first and stored there after they have been generated.

    Args:
        tile_id (str): The id of a tile, consisting of the tileset id,
            followed by the tile position (e.g. PIYqJpdyTCmAZGmA6jNHJw.4.0.0)
        request (django.http.HTTPRequest): The request that included this tile.

    Returns:
        (string, dict): A tuple containing the tile ID and the tile data.
        For a private tileset requested by someone other than its owner the
        tuple holds the tileset uuid and ``{'error': "Forbidden"}`` instead.
        ``None`` is returned when ``make_cooler_tile`` yields no tile.
    '''

    tile_id_parts = tile_id.split('.')
    # Build a real list: on Python 3 map() returns an iterator, which cannot
    # be indexed below.
    tile_position = list(map(int, tile_id_parts[1:]))
    tileset_uuid = tile_id_parts[0]

    tileset = tm.Tileset.objects.get(uuid=tileset_uuid)

    if tileset.private and request.user != tileset.owner:
        # dataset is not public return an empty set
        return (tileset_uuid, {'error': "Forbidden"})

    tile_value = rdb.get(tile_id)
    if tile_value is not None:
        # NOTE(review): pickle.loads executes arbitrary code if the cache
        # contents can be written by an untrusted party. Confirm the cache
        # is only written by this service.
        tile_value = pickle.loads(tile_value)
        return (tile_id, tile_value)

    if tileset.filetype == "hitile":
        # Read-only; the handle is closed once the tile has been encoded.
        with h5py.File(get_datapath(tileset.datafile), 'r') as hitile:
            dense = hdft.get_data(
                hitile,
                tile_position[0],
                tile_position[1]
            )
            tile_value = {'dense': base64.b64encode(dense)}

    elif tileset.filetype == 'beddb':
        tile_value = cdt.get_tile(
            get_datapath(tileset.datafile),
            tile_position[0],
            tile_position[1]
        )

    elif tileset.filetype == 'bed2ddb':
        tile_value = cdt.get_2d_tile(
            get_datapath(tileset.datafile),
            tile_position[0],
            tile_position[1],
            tile_position[2]
        )

    elif tileset.filetype == 'hibed':
        # Convert to plain lists while the file is still open.
        with h5py.File(get_datapath(tileset.datafile), 'r') as hibed:
            dense = hdft.get_discrete_data(
                hibed,
                tile_position[0],
                tile_position[1]
            )
            tile_value = {'discrete': [list(d) for d in dense]}

    elif tileset.filetype == "elasticsearch":
        # NOTE(review): urllib.urlopen only exists on Python 2; on Python 3
        # this would need urllib.request.urlopen. Confirm the target version.
        response = urllib.urlopen(
            tileset.datafile + '/' + '.'.join(map(str, tile_position))
        )
        tile_value = json.loads(response.read())["_source"]["tile_value"]

    else:
        tile_value = make_cooler_tile(
            get_datapath(tileset.datafile), tile_position
        )
        if tile_value is None:
            return None

    rdb.set(tile_id, pickle.dumps(tile_value))
    return (tile_id, tile_value)