Ejemplo n.º 1
0
def _multivec(filepath,
              output_file,
              assembly,
              tile_size,
              chromsizes_filename,
              starting_resolution,
              row_infos_filename=None,
              method='sum'):
    '''
    Aggregate a multivec file.

    This is a file containing nxn data that is aggregated along only one axis.
    This data should be in an HDF5 file where each dataset is named for a
    chromosome and contains a 'resolutions' group containing values for the
    base level resolution.

    Example: f['chr1']['resolutions']['1000'] = [[1,2,3],[4,5,6]]

    The resulting data will be organized by resolution and chromosome.

    Example: f_out['chr1']['resolutions']['5000']=[[1000,2000,3000],[4000,5000,6000]]

    Aggregation combines adjacent pairs of entries along the first axis of
    the array handed to the aggregator.

    Parameters
    ----------
    filepath:
        Path of the input HDF5 file; it is opened read-only.
    output_file:
        Path of the file to write. When None, it is derived from `filepath`
        by replacing its extension with '.multires.mv5'.
    assembly, chromsizes_filename:
        Forwarded to `cch.load_chromsizes` to obtain the chromosome names
        and sizes.
    tile_size, starting_resolution:
        Forwarded unchanged to `cmv.create_multivec_multires`.
    row_infos_filename:
        Optional text file with one row description per line. Each line is
        stripped and UTF-8 encoded before being passed along.
    method:
        'logsumexp' combines each pair with `sm.logsumexp`. Any other value
        (including the default 'sum' and 'maxtotal', which has no dedicated
        aggregator here) sums each pair with `np.nansum`.
    '''
    # The context manager guarantees the input handle is released even when
    # one of the steps below raises.
    with h5py.File(filepath, 'r') as f_in:
        if output_file is None:
            output_file = op.splitext(filepath)[0] + ".multires.mv5"

        (chrom_info, chrom_names,
         chrom_sizes) = cch.load_chromsizes(chromsizes_filename, assembly)

        if method == 'logsumexp':

            def agg(x):
                # (n, rows) -> (rows, n / 2, 2): the last axis holds the two
                # neighbouring entries that get merged into one.
                a = x.T.reshape((x.shape[1], -1, 2))
                return sm.logsumexp(a, axis=2).T
        else:

            def agg(x):
                # Same pairing as above, summed while ignoring NaN entries.
                return np.nansum(x.T.reshape((x.shape[1], -1, 2)), axis=2).T

        print("agg:", agg)
        if row_infos_filename is not None:
            with open(row_infos_filename, 'r') as fr:
                row_infos = [line.strip().encode('utf8') for line in fr]
        else:
            row_infos = None
        print("row_infos:", row_infos)

        cmv.create_multivec_multires(
            f_in,
            chromsizes=zip(chrom_names, chrom_sizes),
            agg=agg,
            starting_resolution=starting_resolution,
            tile_size=tile_size,
            output_file=output_file,
            row_infos=row_infos)
Ejemplo n.º 2
0
def _bedgraph_to_multivec(filepaths, output_file, assembly, chrom_col,
                          from_pos_col, to_pos_col, value_col, has_header,
                          chunk_size, nan_value, chromsizes_filename,
                          starting_resolution, num_rows, format,
                          row_infos_filename, tile_size, method):
    '''
    Convert bedgraph-like text files into a multi-resolution multivec file.

    The input files are first loaded into a temporary HDF5 file holding one
    dataset per chromosome, and that file is then aggregated with
    `cmv.create_multivec_multires`.

    Parameters
    ----------
    filepaths:
        Input files, handed to `cmv.bedfile_to_multivec`. Each dataset gets
        `num_rows * len(filepaths)` columns.
    output_file:
        Destination path. When None, it is derived from the first input file
        by replacing its extension with '.multires.mv5'. An existing file at
        that path is removed first.
    assembly, chromsizes_filename:
        Forwarded to `cch.load_chromsizes`.
    chrom_col, from_pos_col, to_pos_col, value_col:
        1-based column numbers used by the default line parser.
    has_header, chunk_size:
        Forwarded to `cmv.bedfile_to_multivec`.
    nan_value:
        Not used by this function.
    starting_resolution:
        Base resolution; each dataset has
        ceil(chromosome length / starting_resolution) rows.
    num_rows:
        Number of value columns read from each line, starting at
        `value_col`.
    format:
        'epilogos' and 'states' select `epilogos_bedline_to_vector` and
        `states_bedline_to_vector`; any other value uses the default
        parser defined below.
    row_infos_filename:
        Optional text file with one row description per line (stripped and
        UTF-8 encoded). Required when `format` is 'states'.
    tile_size:
        Forwarded to `cmv.create_multivec_multires`.
    method:
        'logsumexp' merges adjacent entries with a NaN-ignoring logsumexp;
        any other value sums them.

    Raises
    ------
    AssertionError
        If `format` is 'states' and no row infos file was given.
    ValueError
        From the default parser when lines grouped together disagree on
        chromosome, start or end, and from the logsumexp aggregator when the
        data already contains values below its NaN sentinel threshold.
    '''
    print('chrom_col:', chrom_col)

    with tempfile.TemporaryDirectory() as td:
        print('temporary dir:', td)

        temp_file = op.join(td, 'temp.mv5')

        (chrom_info, chrom_names,
         chrom_sizes) = cch.load_chromsizes(chromsizes_filename, assembly)

        if row_infos_filename is not None:
            with open(row_infos_filename, 'r') as fr:
                row_infos = [line.strip().encode('utf8') for line in fr]
        else:
            row_infos = None

        def bedline_to_chrom_start_end_vector(bedlines, row_infos=None):
            # Default parser: every line in `bedlines` must describe the
            # same interval; their value columns are concatenated in order.
            # `row_infos` is accepted but not used.
            chrom_set = set()
            start_set = set()
            end_set = set()
            all_vector = []

            for bedline in bedlines:
                parts = bedline.strip().split()
                chrom = parts[chrom_col - 1]
                start = int(parts[from_pos_col - 1])
                end = int(parts[to_pos_col - 1])
                # 'NA' marks a missing value and is stored as NaN.
                vector = [
                    float(f) if f != 'NA' else np.nan
                    for f in parts[value_col - 1:value_col - 1 + num_rows]
                ]
                chrom_set.add(chrom)
                start_set.add(start)
                end_set.add(end)

                if len(chrom_set) > 1:
                    raise ValueError("Chromosomes don't match in these lines:",
                                     bedlines)
                if len(start_set) > 1:
                    raise ValueError(
                        "Start positions don't match in these lines:",
                        bedlines)
                if len(end_set) > 1:
                    raise ValueError(
                        "End positions don't match in these lines:", bedlines)
                all_vector += vector

            return (list(chrom_set)[0], list(start_set)[0], list(end_set)[0],
                    all_vector)

        # The context manager closes the temporary file even if loading
        # fails, so the temporary directory can always be cleaned up.
        with h5py.File(temp_file, 'w') as f_out:
            for chrom in chrom_info.chrom_order:
                f_out.create_dataset(chrom, (math.ceil(
                    chrom_info.chrom_lengths[chrom] / starting_resolution),
                                             num_rows * len(filepaths)),
                                     fillvalue=np.nan,
                                     compression='gzip')

            if format == 'epilogos':
                cmv.bedfile_to_multivec(filepaths, f_out,
                                        epilogos_bedline_to_vector,
                                        starting_resolution, has_header,
                                        chunk_size)
            elif format == 'states':
                assert (
                    row_infos is not None
                ), "A row_infos file must be provided for --format = 'states' "
                # Map each row description to its position in the file.
                states_dic = {
                    name: index
                    for index, name in enumerate(row_infos)
                }

                cmv.bedfile_to_multivec(filepaths, f_out,
                                        states_bedline_to_vector,
                                        starting_resolution, has_header,
                                        chunk_size, states_dic)
            else:
                cmv.bedfile_to_multivec(filepaths, f_out,
                                        bedline_to_chrom_start_end_vector,
                                        starting_resolution, has_header,
                                        chunk_size)

        if output_file is None:
            output_file = op.splitext(filepaths[0])[0] + '.multires.mv5'
        print('output_file:', output_file)

        # Overwrite the output file if it exists
        if op.exists(output_file):
            os.remove(output_file)

        if method == 'logsumexp':

            def agg(x):
                # (n, rows) -> (rows, n / 2, 2): the last axis holds the two
                # neighbouring entries that get merged into one.
                pairs = x.T.reshape((x.shape[1], -1, 2))

                # NaNs are temporarily replaced by a very negative sentinel
                # so they contribute (almost) nothing to logsumexp. Results
                # that are still below the threshold came from pairs that
                # were entirely NaN and are turned back into NaN.
                SMALL_NUM = -1e8
                NAN_THRESHOLD_NUM = SMALL_NUM / 100

                if np.nanmin(pairs) < NAN_THRESHOLD_NUM:
                    raise ValueError(
                        "Error removing nan's when running logsumexp aggregation"
                    )

                # Build an explicit filled copy. `reshape` may return either
                # a view or a copy, so substituting through a flattened
                # alias can be lost or can modify the caller's array.
                filled = np.where(np.isnan(pairs), SMALL_NUM, pairs)
                res = sm.logsumexp(filled, axis=2).T
                res[res < NAN_THRESHOLD_NUM] = np.nan

                return res
        else:

            def agg(x):
                return x.T.reshape((x.shape[1], -1, 2)).sum(axis=2).T

        # Re-open the temporary file read-only for aggregation and make sure
        # it is closed before the temporary directory is removed.
        with h5py.File(temp_file, 'r') as f_in:
            cmv.create_multivec_multires(
                f_in,
                chromsizes=zip(chrom_names, chrom_sizes),
                agg=agg,
                starting_resolution=starting_resolution,
                tile_size=tile_size,
                output_file=output_file,
                row_infos=row_infos)
Ejemplo n.º 3
0
def bigwigs_to_multivec(
    filepaths,
    output_file,
    assembly,
    chromsizes_filename,
    row_infos_filename,
    tile_size,
):
    """
    Combine several bigwig files into one multi-resolution multivec file.

    Each input file fills one column of a temporary HDF5 file that holds one
    dataset per chromosome at a resolution of 1. That file is then
    aggregated with `cmv.create_multivec_multires` by summing adjacent
    entries.

    Parameters
    ----------
    filepaths:
        Input files. Files for which `bbi.is_bigwig` is false are reported
        and skipped; their column keeps the NaN fill value.
    output_file:
        Destination path, forwarded to `cmv.create_multivec_multires`.
    assembly, chromsizes_filename:
        Forwarded to `cch.load_chromsizes`.
    row_infos_filename:
        Optional text file with one row description per line (stripped and
        UTF-8 encoded). Pass None to omit row infos.
    tile_size:
        Forwarded to `cmv.create_multivec_multires`.
    """
    with tempfile.TemporaryDirectory() as td:
        print("temporary dir:", td)

        temp_file = op.join(td, "temp.mv5")

        (chrom_info, chrom_names,
         chrom_lengths) = cch.load_chromsizes(chromsizes_filename, assembly)

        if row_infos_filename is not None:
            with open(row_infos_filename, "r") as f:
                row_infos = [line.strip().encode("utf8") for line in f]
        else:
            row_infos = None

        starting_resolution = 1
        resolution = starting_resolution

        # The context manager closes the temporary file even if reading a
        # bigwig fails, so the temporary directory can always be cleaned up.
        with h5py.File(temp_file, "w") as f_out:
            for chrom in chrom_info.chrom_order:
                f_out.create_dataset(
                    chrom,
                    (
                        math.ceil(chrom_info.chrom_lengths[chrom] /
                                  starting_resolution),
                        len(filepaths),
                    ),
                    fillvalue=np.nan,
                    compression="gzip",
                )

            # Fill in one column per bigwig file.
            for bw_index, bw_file in tqdm(list(enumerate(filepaths)),
                                          desc="bigwigs"):
                if not bbi.is_bigwig(bw_file):
                    print(f"{bw_file} not is_bigwig")
                    continue

                # Only chromosomes present in both the bigwig and the
                # loaded chromsizes are copied.
                chromsizes = bbi.chromsizes(bw_file)
                matching_chromosomes = set(chromsizes.keys()).intersection(
                    set(chrom_names))

                for chr_name in matching_chromosomes:
                    print("chr_name:", chr_name, resolution)
                    chr_len = chrom_info.chrom_lengths[chr_name]
                    n_bins = math.ceil(chr_len / resolution)
                    arr = bbi.fetch(bw_file,
                                    chr_name,
                                    0,
                                    chr_len,
                                    n_bins,
                                    summary="sum")
                    f_out[chr_name][:, bw_index] = arr

        def agg(x):
            # (n, rows) -> (rows, n / 2, 2); sum each adjacent pair.
            return x.T.reshape((x.shape[1], -1, 2)).sum(axis=2).T

        # Re-open the temporary file read-only for aggregation and make sure
        # it is closed before the temporary directory is removed.
        with h5py.File(temp_file, "r") as f_in:
            cmv.create_multivec_multires(
                f_in,
                chromsizes=zip(chrom_names, chrom_lengths),
                agg=agg,
                starting_resolution=starting_resolution,
                tile_size=tile_size,
                output_file=output_file,
                row_infos=row_infos,
            )