# Shared imports for the independent example snippets below (each snippet
# comes from a different project, but they all rely on the same core
# libraries).
import math

import bbi
import cooler
import numpy as np
import pandas as pd


def chunk(bigwig, window_size, step_size, aggregation, chroms, verbose=False):
    base_bins = math.ceil(window_size / aggregation)
    chromsizes = bbi.chromsizes(bigwig)

    chrom_values = []

    for chrom in chroms:
        if chrom not in chromsizes:
            print("Skipping chrom (not in bigWig file):", chrom)
            continue

        chrom_size = chromsizes[chrom]

        values = np.zeros(
            (math.ceil((chrom_size - step_size) / step_size), base_bins)
        )
        starts = np.arange(0, chrom_size - step_size, step_size)
        ends = np.append(
            np.arange(window_size, chrom_size, step_size), chrom_size
        )

        # Extract all but the last window in one batched call, which is
        # faster than calling `fetch` in a loop
        values[:-1] = bbi.stackup(
            bigwig,
            [chrom] * (starts.size - 1),
            starts[:-1],
            ends[:-1],
            bins=base_bins,
            missing=0.0,
        )

        final_bins = math.ceil((ends[-1] - starts[-1]) / aggregation)
        # Extract the last window separately because its size is likely to
        # be different from the others
        values[-1, :final_bins] = bbi.fetch(
            bigwig, chrom, starts[-1], ends[-1], bins=final_bins, missing=0.0
        )

        if verbose:
            print(
                "Chrom: {}".format(chrom),
                "# win: {}".format(values.shape[0]),
                "Max: {}".format(np.max(values)),
            )

        chrom_values.append(values)

    return chrom_values
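# Usage sketch for `chunk` (illustrative only; the bigWig path and the
# chromosome list are hypothetical placeholders, not from the original
# source).
windows = chunk(
    "signal.bigWig",    # hypothetical input file
    window_size=12000,  # 12 kb sliding windows
    step_size=6000,     # 50% overlap between consecutive windows
    aggregation=100,    # aggregate the signal into 100 bp bins
    chroms=["chr1", "chr2"],
    verbose=True,
)
# `windows` holds one (num_windows, 120) array per chromosome, since
# ceil(12000 / 100) = 120 bins per window.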
def tileset_info(bwpath):
    '''
    Get the tileset info for a bigWig file

    Parameters
    ----------
    bwpath: string
        The path to the bigwig file from which to retrieve data

    Returns
    -------
    tileset_info: {'min_pos': [],
                   'max_pos': [],
                   'max_width': 131072,
                   'tile_size': 1024,
                   'max_zoom': 7}
    '''
    TILE_SIZE = 1024

    chromsizes = bbi.chromsizes(bwpath)
    chromosomes = cooler.util.natsorted(chromsizes.keys())
    chromsizes = pd.Series(chromsizes)[chromosomes]

    min_tile_cover = np.ceil(sum(chromsizes) / TILE_SIZE)
    max_zoom = int(np.ceil(np.log2(min_tile_cover)))

    tileset_info = {
        'min_pos': [0],
        'max_pos': [TILE_SIZE * 2**max_zoom],
        'max_width': TILE_SIZE * 2**max_zoom,
        'tile_size': TILE_SIZE,
        'max_zoom': max_zoom,
    }

    return tileset_info
def tileset_info(bwpath): """Get the tileset info for a bigWig file Parameters ---------- bwpath: string Path to the bigwig file Returns ------- tileset_info: { 'min_pos': [], 'max_pos': [], 'max_width': 131072 'tile_size': 1024, 'max_zoom': 7 } """ TILE_SIZE = 1024 chromsizes = bbi.chromsizes(bwpath) chromosomes = cooler.util.natsorted(chromsizes.keys()) chromsizes = pd.Series(chromsizes)[chromosomes] min_tile_cover = np.ceil(sum(chromsizes) / TILE_SIZE) max_zoom = int(np.ceil(np.log2(min_tile_cover))) tileset_info = { "min_pos": [0], "max_pos": [TILE_SIZE * 2**max_zoom], "max_width": TILE_SIZE * 2**max_zoom, "tile_size": TILE_SIZE, "max_zoom": max_zoom, } return tileset_info
def get_chromsizes(bwpath):
    """TODO: replace this with negspy.

    Also, return NaNs from any missing chromosomes in bbi.fetch
    """
    chromsizes = bbi.chromsizes(bwpath)
    chromosomes = cooler.util.natsorted(chromsizes.keys())
    return pd.Series(chromsizes)[chromosomes]
# This variant uses a bare `natsorted`; the import below is an assumption
# (cooler.util.natsorted works, as would natsort.natsorted).
from cooler.util import natsorted


def get_chromsizes(bwpath):
    """
    TODO: replace this with negspy

    Also, return NaNs from any missing chromosomes in bbi.fetch
    """
    chromsizes = bbi.chromsizes(bwpath)
    chromosomes = natsorted(chromsizes.keys())
    chrom_series = pd.Series(chromsizes)[chromosomes]
    return chrom_series
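# Usage sketch (the path is a hypothetical placeholder):
sizes = get_chromsizes("signal.bigWig")
print(sizes.head())
# Natural sorting puts chr2 before chr10, whereas plain lexicographic
# sorting of the names would order chr10 first.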
def test_fetch_oob(path):
    x = bbi.fetch(path, 'chr21', -10, 1000, oob=np.nan)
    assert np.all(np.isnan(x[:10]))
    x = bbi.fetch(path, 'chr21', -10, 1000, oob=0)
    assert np.all(x[:10] == 0)

    n = bbi.chromsizes(path)['chr21']
    x = bbi.fetch(path, 'chr21', n - 1000, n + 10, oob=np.nan)
    assert np.all(np.isnan(x[-10:]))
    x = bbi.fetch(path, 'chr21', n - 1000, n + 10, oob=0)
    assert np.all(x[-10:] == 0)
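# The tests above pin down the `oob` semantics: bbi.fetch pads positions
# that fall outside the chromosome with the given fill value instead of
# raising. A minimal sketch (the file path is a placeholder):
#
#   vals = bbi.fetch("test.bigWig", "chr21", -5, 5, oob=np.nan)
#   # vals[:5] is NaN padding for positions -5..-1; vals[5:] is real signal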
def test_chromsizes(uri):
    chromsizes = bbi.chromsizes(uri)
    assert len(chromsizes) == 1 and 'chr21' in chromsizes
# Additional imports for this snippet. `cch` and `cmv` are assumed to be
# clodius helper modules, e.g.:
#   import clodius.chromosomes as cch  # provides load_chromsizes
#   import clodius.multivec as cmv     # provides create_multivec_multires
import os.path as op
import tempfile

import h5py
from tqdm import tqdm


def bigwigs_to_multivec(
    filepaths,
    output_file,
    assembly,
    chromsizes_filename,
    row_infos_filename,
    tile_size,
):
    with tempfile.TemporaryDirectory() as td:
        print("temporary dir:", td)

        temp_file = op.join(td, "temp.mv5")
        f_out = h5py.File(temp_file, "w")

        (chrom_info, chrom_names, chrom_lengths) = cch.load_chromsizes(
            chromsizes_filename, assembly
        )

        if row_infos_filename is not None:
            with open(row_infos_filename, "r") as f:
                row_infos = [line.strip().encode("utf8") for line in f]
        else:
            row_infos = None

        starting_resolution = 1
        resolution = starting_resolution

        for chrom in chrom_info.chrom_order:
            f_out.create_dataset(
                chrom,
                (
                    math.ceil(
                        chrom_info.chrom_lengths[chrom] / starting_resolution
                    ),
                    len(filepaths),
                ),
                fillvalue=np.nan,
                compression="gzip",
            )

        # Fill in data for each bigwig file.
        for bw_index, bw_file in tqdm(list(enumerate(filepaths)), desc="bigwigs"):
            if bbi.is_bigwig(bw_file):
                chromsizes = bbi.chromsizes(bw_file)
                matching_chromosomes = set(chromsizes.keys()).intersection(
                    set(chrom_names)
                )

                # Fill in data for each matching chromosome of a bigwig file.
                for chr_name in matching_chromosomes:
                    print("chr_name:", chr_name, resolution)
                    chr_len = chrom_info.chrom_lengths[chr_name]
                    chr_shape = (math.ceil(chr_len / resolution), len(filepaths))
                    arr = bbi.fetch(
                        bw_file, chr_name, 0, chr_len, chr_shape[0], summary="sum"
                    )
                    f_out[chr_name][:, bw_index] = arr
            else:
                print(f"{bw_file} not is_bigwig")

        f_out.flush()
        f_out.close()

        tf = temp_file
        f_in = h5py.File(tf, "r")

        def agg(x):
            # Sum adjacent pairs of bins along the genomic axis, halving the
            # resolution at each aggregation step.
            return x.T.reshape((x.shape[1], -1, 2)).sum(axis=2).T

        cmv.create_multivec_multires(
            f_in,
            chromsizes=zip(chrom_names, chrom_lengths),
            agg=agg,
            starting_resolution=starting_resolution,
            tile_size=tile_size,
            output_file=output_file,
            row_infos=row_infos,
        )
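# Hypothetical invocation of the converter above; all file names are
# placeholders, and "hg19.chrom.sizes" stands for the usual two-column
# name/length file.
bigwigs_to_multivec(
    filepaths=["a.bigWig", "b.bigWig"],
    output_file="out.mv5",
    assembly="hg19",
    chromsizes_filename="hg19.chrom.sizes",
    row_infos_filename=None,  # no per-row metadata
    tile_size=256,
)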
# Additional imports and module-level constants assumed by this snippet:
import json
import os
import resource

import negspy.coordinates as nc

GENOME_BUILD = "hg19"          # illustrative; not defined in the snippet
GENOME_LENGTH = 3_100_000_000  # approximate hg19 total length, illustrative


def bigwigs_to_multivec(input_bigwig_files, output_file, starting_resolution):
    f = h5py.File(output_file, 'w')

    num_samples = len(input_bigwig_files)

    # Create level zero groups
    info_group = f.create_group("info")
    resolutions_group = f.create_group("resolutions")
    chroms_group = f.create_group("chroms")

    # Set info attributes
    info_group.attrs['tile-size'] = 256

    # Prepare to fill in chroms dataset
    chromosomes = nc.get_chromorder(GENOME_BUILD)
    chromosomes = chromosomes[:25]  # TODO: should more than chr1-chrM be used?
    chroms_length_arr = np.array(
        # Use GENOME_BUILD for consistency with get_chromorder above
        [nc.get_chrominfo(GENOME_BUILD).chrom_lengths[x] for x in chromosomes],
        dtype="i8",
    )
    chroms_name_arr = np.array(chromosomes, dtype="S23")

    chromosomes_set = set(chromosomes)
    chrom_name_to_length = dict(zip(chromosomes, chroms_length_arr))

    # Fill in chroms dataset entries "length" and "name"
    chroms_group.create_dataset("length", data=chroms_length_arr)
    chroms_group.create_dataset("name", data=chroms_name_arr)

    num_zoom_levels = math.floor(math.log2(GENOME_LENGTH / starting_resolution))

    # Prepare to fill in resolutions dataset
    resolutions = [starting_resolution * (2 ** x) for x in range(num_zoom_levels)]

    # Create each resolution group.
    for resolution in resolutions:
        resolution_group = resolutions_group.create_group(str(resolution))
        # TODO: remove the unnecessary "values" layer
        resolution_values_group = resolution_group.create_group("values")

        # Create each chromosome dataset.
        for chr_name, chr_len in zip(chromosomes, chroms_length_arr):
            chr_shape = (math.ceil(chr_len / resolution), num_samples)
            resolution_values_group.create_dataset(
                chr_name,
                chr_shape,
                dtype="f4",
                fillvalue=np.nan,
                compression='gzip',
            )

    # Fill in data for each bigwig file.
    for bw_index, bw_file in enumerate(input_bigwig_files):
        if bbi.is_bigwig(bw_file):
            chromsizes = bbi.chromsizes(bw_file)
            matching_chromosomes = set(chromsizes.keys()).intersection(
                chromosomes_set
            )

            # Fill in data for each resolution of a bigwig file.
            for resolution in resolutions:
                # Fill in data for each chromosome of a resolution of a
                # bigwig file.
                for chr_name in matching_chromosomes:
                    chr_len = chrom_name_to_length[chr_name]
                    num_bins = math.ceil(chr_len / resolution)
                    arr = bbi.fetch(
                        bw_file, chr_name, 0, chr_len, num_bins, summary="sum"
                    )
                    resolutions_group[str(resolution)]["values"][chr_name][
                        :, bw_index
                    ] = arr
        else:
            print(f"{bw_file} not is_bigwig")

    f.flush()
    f.close()

    max_mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print(max_mem)

    # Append metadata to the top resolution row_infos attribute.
    row_infos = []
    for input_bigwig_file in input_bigwig_files:
        _, filename = os.path.split(input_bigwig_file)
        name, _ = os.path.splitext(filename)
        row_infos.append({'id': name})

    row_infos_encoded = str(json.dumps(row_infos))

    f = h5py.File(output_file, 'r+')
    info_group = f["info"]
    info_group["row_infos"] = row_infos_encoded
    f.close()
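# Worked example of the zoom-level count above: with the illustrative
# GENOME_LENGTH of ~3.1e9 and starting_resolution = 25,
#   num_zoom_levels = floor(log2(3.1e9 / 25)) = floor(log2(1.24e8)) = 26
# yielding resolutions 25, 50, 100, ..., 25 * 2**25.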
# Additional imports for this snippet:
import zarr
from numcodecs import Zlib


def bigwigs_to_zarr(input_bigwig_files, output_file, starting_resolution, name):
    # Short-hand for creating a DirectoryStore with a root group.
    f = zarr.open(output_file, mode='w')
    compressor = Zlib(level=1)

    num_samples = len(input_bigwig_files)

    # Create level zero groups
    chromosomes_group = f.create_group("chromosomes")

    # Prepare to fill in chroms dataset
    chromosomes = nc.get_chromorder('hg38')
    # TODO: should more than chr1-chrM be used?
    chromosomes = [str(chr_name) for chr_name in chromosomes[:25]]
    num_chromosomes = len(chromosomes)
    chroms_length_arr = np.array(
        [nc.get_chrominfo('hg38').chrom_lengths[x] for x in chromosomes],
        dtype="i8",
    )
    chroms_cumsum_arr = np.concatenate(
        (np.array([0]), np.cumsum(chroms_length_arr))
    )

    chromosomes_set = set(chromosomes)
    chrom_name_to_length = dict(zip(chromosomes, chroms_length_arr))
    chrom_name_to_cumsum = dict(zip(chromosomes, chroms_cumsum_arr))

    # Prepare to fill in resolutions dataset
    resolutions = [starting_resolution * (2 ** x) for x in range(16)]

    # Create each chromosome dataset.
    for chr_name, chr_len in chrom_name_to_length.items():
        chr_group = chromosomes_group.create_group(chr_name)

        # Create each resolution group.
        for resolution in resolutions:
            chr_shape = (num_samples, math.ceil(chr_len / resolution))
            chr_group.create_dataset(
                str(resolution),
                shape=chr_shape,
                dtype="f4",
                fill_value=np.nan,
                compressor=compressor,
            )

    # Fill in data for each bigwig file.
    for bw_index, bw_file in tqdm(
        list(enumerate(input_bigwig_files)), desc='bigwigs'
    ):
        if bbi.is_bigwig(bw_file):
            chromsizes = bbi.chromsizes(bw_file)
            matching_chromosomes = set(chromsizes.keys()).intersection(
                chromosomes_set
            )

            # Fill in data for each resolution of a bigwig file.
            for resolution in resolutions:
                # Fill in data for each chromosome of a resolution of a
                # bigwig file.
                for chr_name in matching_chromosomes:
                    chr_len = chrom_name_to_length[chr_name]
                    chr_shape = (num_samples, math.ceil(chr_len / resolution))
                    arr = bbi.fetch(
                        bw_file, chr_name, 0, chr_len, chr_shape[1],
                        summary="sum",
                    )
                    chromosomes_group[chr_name][str(resolution)][
                        bw_index, :
                    ] = arr
        else:
            print(f"{bw_file} not is_bigwig")

    max_mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print(max_mem)

    # Append metadata to the top resolution row_infos attribute.
    row_infos = []
    for bw_index, bw_file in enumerate(input_bigwig_files):
        row_infos.append({
            "cluster": int(bw_index + 1),
            "file": os.path.basename(bw_file),
        })

    # f.attrs should contain all tileset_info properties
    # For zarr, more attributes are used here to allow "serverless"
    f.attrs['row_infos'] = row_infos
    f.attrs['resolutions'] = sorted(resolutions, reverse=True)
    f.attrs['shape'] = [num_samples, 256]
    f.attrs['name'] = name
    f.attrs['coordSystem'] = "hg38"

    # https://github.com/zarr-developers/zarr-specs/issues/50
    f.attrs['multiscales'] = [
        {
            "version": "0.1",
            "name": chr_name,
            "datasets": [
                {"path": f"chromosomes/{chr_name}/{resolution}"}
                for resolution in sorted(resolutions, reverse=True)
            ],
            "type": "zarr-multivec",
            "metadata": {
                "chromoffset": int(chrom_name_to_cumsum[chr_name]),
                "chromsize": int(chr_len),
            },
        }
        for (chr_name, chr_len) in zip(chromosomes, chroms_length_arr)
    ]
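# Hypothetical invocation (paths and name are placeholders). Because
# zarr.open is handed a directory path, the result is a DirectoryStore on
# disk, i.e. a "clusters.zarr" folder:
bigwigs_to_zarr(
    input_bigwig_files=["cluster1.bigWig", "cluster2.bigWig"],
    output_file="clusters.zarr",
    starting_resolution=25,
    name="my multivec",
)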
# count_peaks, peak_widths, peak_distances, and peak_heights are helpers
# defined elsewhere in the source project.
def get_stats(bigwig, bigbed, norm_vals, window_size, step_size, aggregation,
              chrom):
    base_bins = math.ceil(window_size / aggregation)

    if chrom not in bbi.chromsizes(bigwig):
        print("Skipping chrom (not in bigWig file):", chrom)
        return None

    chrom_size = bbi.chromsizes(bigwig)[chrom]

    intervals = np.zeros(
        (math.ceil((chrom_size - step_size) / step_size), base_bins)
    )
    starts = np.arange(0, chrom_size - step_size, step_size)
    ends = np.append(np.arange(window_size, chrom_size, step_size), chrom_size)

    # Extract all but the last window in one batched call, which is faster
    # than calling `fetch` in a loop
    intervals[:-1] = bbi.stackup(
        bigbed,
        [chrom] * (starts.size - 1),
        starts[:-1],
        ends[:-1],
        bins=base_bins,
    )

    final_bins = math.ceil((ends[-1] - starts[-1]) / aggregation)
    # Extract the last window separately because its size is likely to be
    # different from the others
    intervals[-1, :final_bins] = bbi.fetch(
        bigbed, chrom, starts[-1], ends[-1], bins=final_bins, missing=0.0
    )

    intervals = np.round(intervals).astype(int)

    # Stats columns:
    #  0. Number of intervals
    #  1. Min width of peaks
    #  2. Max width of peaks
    #  3. Median width of peaks
    #  4. Min distance of peaks
    #  5. Max distance of peaks
    #  6. Median distance of peaks
    #  7. Sum of height of peaks
    #  8. Max height of peaks
    #  9. Median height of peaks
    # 10. Median signal
    # 11. Total signal
    # 12. Peak coverage
    stats = np.zeros((norm_vals.shape[0], 13))

    stats[:, 0] = count_peaks(intervals)
    stats[:, 1] = peak_widths(intervals, np.min)
    stats[:, 2] = peak_widths(intervals, np.max)
    stats[:, 3] = peak_widths(intervals, np.median)
    stats[:, 4] = peak_distances(intervals, np.min)
    stats[:, 5] = peak_distances(intervals, np.max)
    stats[:, 6] = peak_distances(intervals, np.median)
    stats[:, 7] = peak_heights(intervals, norm_vals, stats[:, 0], np.nansum)
    stats[:, 8] = peak_heights(intervals, norm_vals, stats[:, 0], np.nanmax)
    stats[:, 9] = peak_heights(intervals, norm_vals, stats[:, 0], np.nanmedian)
    stats[:, 10] = np.median(norm_vals, axis=1)
    stats[:, 11] = np.sum(norm_vals, axis=1)
    stats[:, 12] = peak_widths(intervals, np.sum) / base_bins

    # `intervals` was already rounded and cast to int above
    return stats, intervals
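# Usage sketch for `get_stats` (all inputs hypothetical): `norm_vals` is the
# per-window signal matrix for `chrom`, e.g. as produced by a chunker like
# `chunk` above, with one row per sliding window.
#
#   stats, intervals = get_stats(
#       "signal.bigWig", "peaks.bigBed", norm_vals,
#       window_size=12000, step_size=6000, aggregation=100, chrom="chr1",
#   )
#   peak_coverage = stats[:, 12]  # fraction of each window covered by peaks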
# `metadata_json_to_row_info` is a helper defined elsewhere in the source
# project.
def bigwigs_to_multivec(input_bigwig_files, input_metadata_files, output_file,
                        starting_resolution):
    f = h5py.File(output_file, 'w')

    num_samples = len(input_bigwig_files)

    # Zip the input to create (bw, metadata) tuples
    zipped_input = zip(input_bigwig_files, input_metadata_files)

    # Create level zero groups
    info_group = f.create_group("info")
    resolutions_group = f.create_group("resolutions")
    chroms_group = f.create_group("chroms")

    # Set info attributes
    info_group.attrs['tile-size'] = 256

    # Prepare to fill in chroms dataset
    chromosomes = nc.get_chromorder('hg38')
    chromosomes = chromosomes[:25]  # TODO: should more than chr1-chrM be used?
    num_chromosomes = len(chromosomes)
    chroms_length_arr = np.array(
        [nc.get_chrominfo('hg38').chrom_lengths[x] for x in chromosomes],
        dtype="i8",
    )
    chroms_name_arr = np.array(chromosomes, dtype="S23")

    chromosomes_set = set(chromosomes)
    chrom_name_to_length = dict(zip(chromosomes, chroms_length_arr))

    # Fill in chroms dataset entries "length" and "name"
    chroms_group.create_dataset("length", data=chroms_length_arr)
    chroms_group.create_dataset("name", data=chroms_name_arr)

    # Prepare to fill in resolutions dataset
    resolutions = [starting_resolution * (2 ** x) for x in range(16)]

    # Create each resolution group.
    for resolution in resolutions:
        resolution_group = resolutions_group.create_group(str(resolution))
        # TODO: remove the unnecessary "values" layer
        resolution_values_group = resolution_group.create_group("values")

        # Create each chromosome dataset.
        for chr_name, chr_len in zip(chromosomes, chroms_length_arr):
            chr_shape = (math.ceil(chr_len / resolution), num_samples)
            resolution_values_group.create_dataset(
                chr_name,
                chr_shape,
                dtype="f4",
                fillvalue=np.nan,
                compression='gzip',
            )

    # Fill in data for each bigwig file.
    for bw_index, bw_file in tqdm(
        list(enumerate(input_bigwig_files)), desc='bigwigs'
    ):
        if bbi.is_bigwig(bw_file):
            chromsizes = bbi.chromsizes(bw_file)
            matching_chromosomes = set(chromsizes.keys()).intersection(
                chromosomes_set
            )

            # Fill in data for each resolution of a bigwig file.
            for resolution in resolutions:
                # Fill in data for each chromosome of a resolution of a
                # bigwig file.
                for chr_name in matching_chromosomes:
                    chr_len = chrom_name_to_length[chr_name]
                    chr_shape = (math.ceil(chr_len / resolution), num_samples)
                    arr = bbi.fetch(
                        bw_file, chr_name, 0, chr_len, chr_shape[0],
                        summary="sum",
                    )
                    resolutions_group[str(resolution)]["values"][chr_name][
                        :, bw_index
                    ] = arr
        else:
            print(f"{bw_file} not is_bigwig")

    f.flush()
    f.close()

    max_mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    print(max_mem)

    # Append metadata to the top resolution row_infos attribute.
    row_infos = []
    for metadata_index, metadata_file in enumerate(input_metadata_files):
        with open(metadata_file) as mf:
            try:
                metadata_json = json.load(mf)
            except Exception as e:
                print(f"Error loading metadata file: {metadata_file}")
                print(e)
                metadata_json = None
        row_info = metadata_json_to_row_info(metadata_json)
        row_infos.append(row_info)

    row_infos_encoded = str(json.dumps(row_infos))

    f = h5py.File(output_file, 'r+')
    info_group = f["info"]
    info_group["row_infos"] = row_infos_encoded
    f.close()
# `data.normalize` below is a helper from the surrounding project.
def chunk(
    bigwig,
    window_size,
    resolution,
    step_size,
    chroms,
    normalize=True,
    verbose=False,
):
    base_bins = np.ceil(window_size / resolution).astype(int)
    chromsizes = bbi.chromsizes(bigwig)

    num_total_windows = 0
    for chrom in chroms:
        if chrom not in chromsizes:
            continue  # warned about below
        chrom_size = chromsizes[chrom]
        num_total_windows += np.ceil(
            (chrom_size - step_size) / step_size
        ).astype(int)

    values = np.zeros((num_total_windows, base_bins))

    start = 0
    for chrom in chroms:
        if chrom not in chromsizes:
            print("Skipping chrom (not in bigWig file):", chrom)
            continue

        chrom_size = chromsizes[chrom]
        num_windows = np.ceil((chrom_size - step_size) / step_size).astype(int)
        start_bps = np.arange(0, chrom_size - step_size, step_size)
        end_bps = np.append(
            np.arange(window_size, chrom_size, step_size), chrom_size
        )

        end = start + num_windows

        # Extract all but the last window in one batched call, which is
        # faster than calling `fetch` in a loop
        values[start : end - 1] = bbi.stackup(
            bigwig,
            [chrom] * (start_bps.size - 1),
            start_bps[:-1],
            end_bps[:-1],
            bins=base_bins,
            missing=0.0,
        )

        final_bins = np.ceil(
            (end_bps[-1] - start_bps[-1]) / resolution
        ).astype(int)
        # Extract the last window separately because its size is likely to
        # be different from the others
        values[end - 1, :final_bins] = bbi.fetch(
            bigwig,
            chrom,
            start_bps[-1],
            end_bps[-1],
            bins=final_bins,
            missing=0.0,
        )

        if normalize:
            values[start:end] = data.normalize(values[start:end])

        if verbose:
            print(
                "LOADING ::",
                "Chrom: {}".format(chrom),
                "| Num windows: {}".format(num_windows),
                "| Max value: {}".format(np.max(values[start:end])),
            )

        # Advance the write offset so the next chromosome's windows don't
        # overwrite this one's
        start = end

    return values
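# Usage sketch (placeholders only). Unlike the list-returning `chunk` at the
# top of this page, this variant stacks all chromosomes into a single array:
values = chunk(
    "signal.bigWig",
    window_size=12000,
    resolution=100,
    step_size=6000,
    chroms=["chr1", "chr2"],
    normalize=False,
    verbose=True,
)
# values.shape == (total_windows_over_all_chroms, 120)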