コード例 #1
0
def createCooler(pSparseMatrix, pChromosome, pChromSize, pResolution, pOutfile,
                 pMetadata):
    #get indices of upper triangular matrix
    triu_Indices = np.triu_indices(pSparseMatrix.shape[0])

    #create the bins for cooler
    bins = pd.DataFrame(columns=['chrom', 'start', 'end'])
    binStartList = list(range(0, pChromSize, int(pResolution)))
    binEndList = list(range(int(pResolution), pChromSize, int(pResolution)))
    binEndList.append(pChromSize)
    bins['start'] = binStartList
    bins['end'] = binEndList
    bins['chrom'] = str(pChromosome)

    #create the pixels for cooler
    pixels = pd.DataFrame(columns=['bin1_id', 'bin2_id', 'count'])
    pixels['bin1_id'] = triu_Indices[0]
    pixels['bin2_id'] = triu_Indices[1]
    readCounts = np.array(pSparseMatrix[triu_Indices])[0]
    pixels['count'] = np.float64(readCounts)
    pixels.sort_values(by=['bin1_id', 'bin2_id'], inplace=True)
    #write out the cooler
    cooler.create_cooler(pOutfile,
                         bins=bins,
                         pixels=pixels,
                         dtypes={'count': np.float64},
                         metadata=pMetadata)
コード例 #2
0
ファイル: test_io.py プロジェクト: ngocemy/chromosight
    def test_load_cool(self):
        """Test loading of matrices in cool format"""

        # Write a dummy bedgraph2 (basically a diagonal)
        res, n_bins = 5000, 100000
        chrom_names = ["c1", "c2", "c3"]
        bins_per_chrom = [n_bins // 3, n_bins // 3, n_bins // 3 + n_bins % 3]
        bins = pd.DataFrame(
            {
                "chrom": np.repeat(chrom_names, bins_per_chrom),
                "start": range(0, res * (n_bins), res),
                "end": range(res, res * (n_bins + 1), res),
            }
        )
        pixels = pd.DataFrame(
            {
                "bin1_id": range(n_bins),
                "bin2_id": range(n_bins),
                "count": np.random.randint(0, 100, n_bins),
            }
        )

        # Save dataframes into a cool file using cool API
        cooler.create_cooler(self.tmp_path, bins, pixels)

        # Load cool and check whether it was parsed correctly
        mat, chroms, bins, bin_size = cio.load_cool(self.tmp_path)

        # Median should work to estimate resolution id nbins >> nchroms
        assert res == abs(int(np.nanmedian(bins.start.shift(1) - bins.start)))
        assert res == bin_size
        assert n_bins == bins.shape[0]
        assert np.all(bins.columns == BIN_COLS)
        assert np.all(chroms.columns == CHR_COLS)
        assert mat.sum() == pixels["count"].sum()
コード例 #3
0
ファイル: threshold.py プロジェクト: luckchem/GiniQC
def get_threshold(combos, cull_by_cis, bedfile):
    reads = {}
    rawgini = {}
    adjustedgini = {}
    bins_df = make_df(bedfile)

    if cull_by_cis:
        cis_threshold = int(sys.argv[3])

    for pair in tqdm(combos):
        if pair[0] == pair[1]:
            continue
        pair = tuple(pair)
        cool1, cool2 = get_cools(pair)
        try:
            matrix1 = np.array(cool1.matrix(as_pixels=True, balance=False)[:])
            matrix2 = np.array(cool2.matrix(as_pixels=True, balance=False)[:])
        except:
            continue
        numreads1 = sum(matrix1[:, -1])
        numreads2 = sum(matrix2[:, -1])
        totalreads = numreads1 + numreads2

        if cull_by_cis and (calculate_cistrans(matrix1) < cis_threshold
                            or calculate_cistrans(matrix2) < cis_threshold):
            continue

        if numreads1 == 0 or numreads2 == 0 or totalreads < 50000:
            continue

        numtoselect = int(
            abs(np.random.normal(totalreads / 2, totalreads / 20)))

        rands = np.random.choice(np.arange(1, totalreads),
                                 numtoselect,
                                 replace=False)
        rands.sort()

        pixel_df = fill_pixel_df(rands, matrix1, matrix2, numreads1, numreads2)
        cooler.create_cooler("temp.cool",
                             bins=bins_df,
                             pixels=pixel_df,
                             dtypes={
                                 'bin1_id': int,
                                 'bin2_id': int,
                                 'count': int
                             },
                             ordered=True)
        newcool = cooler.Cooler("temp.cool")

        normalized, reads[pair], cis, trans = normalize_matrix(newcool)
        rawgini[pair] = gini(normalized)
        adjustedgini[pair] = adjust(rawgini[pair], reads[pair])
        os.unlink("temp.cool")

    return reads, rawgini, adjustedgini
コード例 #4
0
def gather_high_low_cool(
        cooler_file='Rao2014-GM12878-DpnII-allreps-filtered.10kb.cool',
        path='./data/raw/',
        chromosome='22',
        scale=4,
        output_path='./experiment/evaluation/'):
    file = os.path.join(path, cooler_file)
    cool_hic = cooler.Cooler(file)
    resolution = cool_hic.binsize
    mat = cool_hic.matrix(balance=True).fetch('chr' + chromosome)
    high_hic, idx = remove_zeros(
        mat)  # idx: {true, false}, len is not changed/shrinked
    bool_idx = np.array(idx).flatten()
    num_idx = np.array(np.where(idx)).flatten()
    low_hic = sampling_hic(high_hic, scale**2, fix_seed=True)
    print('high hic shape: {}.'.format(high_hic.shape), end=' ')
    print('low hic shape: {}.'.format(low_hic.shape))

    b = {
        'chrom': ['chr{}'.format(chromosome)] * len(bool_idx),
        'start': resolution * np.arange(len(bool_idx)),
        'end': resolution * (np.arange(1, (len(bool_idx) + 1))),
        'weight': 1.0 * bool_idx
    }
    bins = pd.DataFrame(data=b)

    high_hic = ICE_normalization(high_hic)
    low_hic = ICE_normalization(low_hic)

    high_hic = triu(high_hic, format='coo')
    low_hic = triu(low_hic, format='coo')

    output_path = os.path.join(output_path, 'chr{}'.format(chromosome))
    os.makedirs(output_path, exist_ok=True)

    outfile = 'high_chr{}.cool'.format(chromosome)
    print('saving file {}'.format(os.path.join(output_path, outfile)))
    uri = os.path.join(output_path, outfile)
    p = {
        'bin1_id': num_idx[high_hic.row],
        'bin2_id': num_idx[high_hic.col],
        'count': high_hic.data
    }
    pixels = pd.DataFrame(data=p)
    cooler.create_cooler(cool_uri=uri, bins=bins, pixels=pixels)

    outfile = 'low_chr{}.cool'.format(chromosome)
    print('saving file {}'.format(os.path.join(output_path, outfile)))
    uri = os.path.join(output_path, outfile)
    p = {
        'bin1_id': num_idx[low_hic.row],
        'bin2_id': num_idx[low_hic.col],
        'count': low_hic.data
    }
    pixels = pd.DataFrame(data=p)
    cooler.create_cooler(cool_uri=uri, bins=bins, pixels=pixels)
コード例 #5
0
def generate_cool(input_path='./experiment/significant_interactions', chromosomes=['22', '21', '20', '19', 'X'], resolution=10000, genomic_distance=2000000):
    k = np.ceil(genomic_distance/resolution).astype(int)
    for chro in chromosomes:
        path = os.path.join(input_path, 'chr{}'.format(chro))
        hicfile = 'sample_high_chr{}.cool'.format(chro)
        cool_hic = cooler.Cooler(os.path.join(path, hicfile))
        mat = cool_hic.matrix(balance=True).fetch('chr' + chro)
        bins = cool_hic.bins().fetch('chr' + chro)
        num_idx = np.array(np.where(np.array(bins['weight']))).flatten()

        high_mat = mat[num_idx, :]
        high_mat = high_mat[:, num_idx]
        high_mat = filter_diag_boundary(high_mat, diag_k=0, boundary_k=k)

        files = [f for f in os.listdir(path) if '.npz' in f]
        for file in files:
            if 'high' in file or 'low' in file:
                continue
            print(file)
            data = np.load(os.path.join(path, file), allow_pickle=True)
            mat = data['hic']
            namelist = file.split('_')
            if len(namelist) == 3:
                name = namelist[0]
            else:
                model = namelist[1]
                win_len = namelist[3]
                if model == 'hicgan':
                    # true_hic = np.log1p(true_hic)
                    mat = np.expm1(mat)
                elif model == 'deephic':
                    minv = high_mat.min()
                    maxv = high_mat.max()
                    # true_hic = np.divide((true_hic-minv), (maxv-minv), dtype=float,out=np.zeros_like(true_hic), where=(maxv-minv) != 0)
                    mat = mat*(maxv-minv)+minv
                    mat = (mat+np.transpose(mat))/2
                elif model == 'hicsr':
                    log_mat = np.log2(high_mat+1)
                    # ture_hic = 2*(log_mat/np.max(log_mat)) - 1
                    maxv = np.max(log_mat)
                    log_predict_hic = (mat+1)/2*maxv
                    mat = np.expm1(log_predict_hic)
                '''elif model == 'ours':
                    scn, dh = scn_normalization(high_mat, max_iter=3000)
                    mat = scn_recover(mat, dh)'''
                name = '_'.join([model, win_len])
            mat = filter_diag_boundary(mat, diag_k=0, boundary_k=k)
            # mat = ICE_normalization(mat)
            print('{} matrix shape: {}'.format(name, mat.shape))
            uri = os.path.join(path, 'sample_{}_chr{}.cool'.format(name, chro))
            mat = triu(mat, format='coo')
            # p = {'bin1_id': mat.row, 'bin2_id': mat.col, 'count': mat.data}
            p = {'bin1_id': num_idx[mat.row], 'bin2_id': num_idx[mat.col], 'count': mat.data}
            pixels = pd.DataFrame(data = p)
            cooler.create_cooler(cool_uri=uri, bins=bins, pixels=pixels)
コード例 #6
0
def save_cool(cool_out, mat, frags, metadata={}):
    """
    Writes a .cool file from graal style tables.
    
    Parameters
    ----------
    cool_out : str
        Path to the output cool file.
    mat : scipy coo_matrix
        The Hi-C contact matrix in sparse COO format.
    frags : pandas DataFrame
        The graal style 'fragments_list' table.
    metadata : dict
        Potential metadata to associate with the cool file.
    """
    up_tri = False
    # Check if symmetric matrix is symmetric
    # (i.e. only upper triangle or full mat)
    if (abs(mat - mat.T) > 1e-10).nnz != 0:
        up_tri = True
    # Drop useless column
    try:
        bins = frags.drop("id", axis=1)
    except KeyError:
        bins = frags
    # Get column names right
    bins.rename(
        columns={
            "seq": "chrom",
            "start_pos": "start",
            "end_pos": "end"
        },
        inplace=True,
    )
    mat_dict = {"bin1_id": mat.row, "bin2_id": mat.col, "count": mat.data}
    pixels = pd.DataFrame(mat_dict)
    cooler.create_cooler(  # pylint: disable=undefined-variable
        cool_out,
        bins,
        pixels,
        metadata=metadata,
        symmetric_upper=up_tri,
        triucheck=False,
    )
コード例 #7
0
    def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):
        log.debug('Save in cool format')

        bins_data_frame, matrix_data_frame, dtype_pixel, info = self.create_cooler_input(
            pSymmetric=pSymmetric, pApplyCorrection=pApplyCorrection)
        local_temp_dir = os.path.dirname(os.path.realpath(pFileName))
        cooler.create_cooler(cool_uri=pFileName,
                             bins=bins_data_frame,
                             pixels=matrix_data_frame,
                             mode=self.appendData,
                             dtypes=dtype_pixel,
                             ordered=True,
                             metadata=self.hic_metadata,
                             temp_dir=local_temp_dir)

        if self.appendData == 'w':
            fileName = pFileName.split('::')[0]
            with h5py.File(fileName, 'r+') as h5file:
                h5file.attrs.update(info)
                h5file.close()
コード例 #8
0
    def to_cooler(self, store, normalise=False, **normalise_options):

        capture_bins = self.capture_bins
        capture_name = self.cooler.info["metadata"]["capture_name"]
        capture_coords = self.cooler.info["metadata"]["capture_coords"]
        capture_chrom = self.cooler.info["metadata"]["capture_chrom"]

        metadata = {
            "capture_bins": [int(x) for x in self.capture_bins],
            "capture_name": capture_name,
            "capture_coords": capture_coords,
            "capture_chrom": capture_chrom,
            "n_cis_interactions": self.n_cis_interactions,
        }

        if normalise:
            self.normalise_pixels(**normalise_options)

        if os.path.exists(
                store):  # Will append to a prexisting file if one is supplied
            cooler_fn = f"{store}::/{capture_name}/resolutions/{self.binsize}"
        else:
            cooler_fn = (
                f"{store.replace('.hdf5', '')}.{capture_name}.{self.binsize}.hdf5"
            )

        cooler.create_cooler(
            cooler_fn,
            bins=self.bins,
            pixels=self.pixels,
            metadata=metadata,
            mode="w" if not os.path.exists(store) else "a",
            columns=self.pixels.columns[2:],
        )

        return cooler_fn
コード例 #9
0
def createCoolersFromDf(pResultsDf, pResolution, pPredictionOutfile, pTargetOutfile):
    #create the bins for cooler
    bins = pd.DataFrame(columns=['chrom','start','end'])
    maxPos = max(pResultsDf['bin1_id'].max(), pResultsDf['bin2_id'].max()) * pResolution + pResolution
    minPos = 0
    binStartList = list(range(minPos, maxPos, pResolution))
    binEndList = list(range(minPos + pResolution, maxPos, pResolution))
    binEndList.append(maxPos)
    bins['start'] = binStartList
    bins['end'] = binEndList
    bins['chrom'] = pResultsDf.loc[0, 'chromosome'] 
    #create the pixels / counts for predicted cooler
    pixels = pd.DataFrame(columns=['bin1_id','bin2_id','count'])
    pixels['bin1_id'] = pResultsDf['bin1_id']
    pixels['bin2_id'] = pResultsDf['bin2_id']
    pixels['count'] = pResultsDf['PredictedValue']
    pixels.sort_values(by=['bin1_id','bin2_id'],inplace=True)  
    #create the pixels / counts for target cooler
    targetPixels = pixels.copy(deep=True)
    targetPixels['count'] = pResultsDf['TrueValue']
    targetPixels.sort_values(by=['bin1_id','bin2_id'],inplace=True)
    #store the coolers
    cooler.create_cooler(pPredictionOutfile, bins=bins, pixels=pixels, dtypes={'count': np.float64})
    cooler.create_cooler(pTargetOutfile, bins=bins, pixels=targetPixels, dtypes={'count': np.float64})
コード例 #10
0
def create_cool(bins, pixels, resolution, cool_file, genome_assembly):
    metadata = {
        'format': 'HDF5::Cooler',
        'format-version': '0.8.10',
        'bin-type': 'fixed',
        'bin-size': resolution,
        'storage-mode': 'symmetric-upper',
        'genome-assembly': genome_assembly,
        'generated-by': 'boost-hic',
        # 'creation-date': datetime.date.today()
    }

    count_dtypes = {'count': 'float64'}
    bins = bins.astype({'chrom': str, 'start': int, 'end': int})
    pixels = pixels.astype({'bin1_id': int, 'bin2_id': int, 'count': float})
    cooler.create_cooler(cool_file,
                         bins=bins,
                         pixels=pixels,
                         dtypes=count_dtypes,
                         ordered=True,
                         ensure_sorted=True,
                         metadata=metadata)
    # cooler.create_cooler(cool_file, bins=bins, pixels=pixels, dtypes=count_dtypes, ordered=True, metadata=metadata)
    return cool_file
コード例 #11
0
ファイル: test_creation.py プロジェクト: zhang-jiankun/cooler
def test_create_custom_cols():

    with isolated_filesystem():
        df = pd.DataFrame(
            {
                "bin1_id": [0, 1, 1, 1, 2, 2, 3, 4, 5],
                "bin2_id": [1, 1, 3, 4, 5, 6, 7, 8, 9],
                "foo": [1, 1, 1, 1, 1, 2, 2, 2, 2],
                "bar": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
            },
            columns=["bin1_id", "bin2_id", "foo", "bar"],
        )
        bins = pd.DataFrame({
            "chrom": ["chr1"] * 5 + ["chr2"] * 5,
            "start": list(range(5)) * 2,
            "end": list(range(1, 6)) * 2,
        })
        # works in unordered mode
        cooler.create_cooler("test.cool", bins, df, columns=["foo", "bar"])
        clr = cooler.Cooler("test.cool")
        assert len(clr.pixels().columns) == 4
        assert np.allclose(
            df,
            clr.pixels()[["bin1_id", "bin2_id", "foo", "bar"]][:])

        # works in ordered mode
        cooler.create_cooler("test.cool",
                             bins,
                             df,
                             columns=["foo", "bar"],
                             ordered=True)
        clr = cooler.Cooler("test.cool")
        assert len(clr.pixels().columns) == 4
        assert np.allclose(
            df,
            clr.pixels()[["bin1_id", "bin2_id", "foo", "bar"]][:])

        # raises if no custom columns specified and 'count' does not exist
        with pytest.raises(ValueError):
            cooler.create_cooler("test.cool",
                                 bins,
                                 df,
                                 columns=None,
                                 ordered=True)
コード例 #12
0
def test_create_custom_cols():

    with isolated_filesystem() as fs:
        df = pd.DataFrame(
            {
                'bin1_id': [0, 1, 1, 1, 2, 2, 3, 4, 5],
                'bin2_id': [1, 1, 3, 4, 5, 6, 7, 8, 9],
                'foo': [1, 1, 1, 1, 1, 2, 2, 2, 2],
                'bar': [.1, .2, .3, .4, .5, .6, .7, .8, .9],
            },
            columns=['bin1_id', 'bin2_id', 'foo', 'bar'])
        bins = pd.DataFrame({
            'chrom': ['chr1'] * 5 + ['chr2'] * 5,
            'start': list(range(5)) * 2,
            'end': list(range(1, 6)) * 2,
        })
        # works in unordered mode
        cooler.create_cooler('test.cool', bins, df, columns=['foo', 'bar'])
        clr = cooler.Cooler('test.cool')
        assert len(clr.pixels().columns) == 4
        assert np.allclose(
            df,
            clr.pixels()[['bin1_id', 'bin2_id', 'foo', 'bar']][:])

        # works in ordered mode
        cooler.create_cooler('test.cool',
                             bins,
                             df,
                             columns=['foo', 'bar'],
                             ordered=True)
        clr = cooler.Cooler('test.cool')
        assert len(clr.pixels().columns) == 4
        assert np.allclose(
            df,
            clr.pixels()[['bin1_id', 'bin2_id', 'foo', 'bar']][:])

        # raises if no custom columns specified and 'count' does not exist
        with pytest.raises(ValueError):
            cooler.create_cooler('test.cool',
                                 bins,
                                 df,
                                 columns=None,
                                 ordered=True)
コード例 #13
0
ファイル: test_cli.py プロジェクト: xtmgah/cooler
def test_dump():
    runner = CliRunner()
    with runner.isolated_filesystem():
        f_in = op.join(datadir, 'toy.symm.upper.2.cool')
        result = runner.invoke(dump, [
            f_in,
        ])
        assert result.exit_code == 0

        # roundtrip symm-upper data
        bins = pd.read_csv(StringIO(
            runner.invoke(dump, [f_in, '-H', '-t', 'bins']).output),
                           sep='\t')
        pixels = pd.read_csv(StringIO(
            runner.invoke(dump, [f_in, '-H']).output),
                             sep='\t')
        cooler.create_cooler('out.cool', bins, pixels, symmetric_upper=True)
        cooler_cmp(f_in, 'out.cool')

        # duplexed output
        pixels2 = pd.read_csv(StringIO(
            runner.invoke(dump, [f_in, '--matrix', '-H']).output),
                              sep='\t')
        assert len(pixels2) > len(pixels)
        upper = pixels2[pixels2['bin1_id'] <= pixels2['bin2_id']].reset_index(
            drop=True)
        assert np.allclose(pixels, upper)

        # lower triangle
        trans_lower = pd.read_csv(StringIO(
            runner.invoke(dump,
                          [f_in, '-H', '-r', 'chr2', '-r2', 'chr1']).output),
                                  sep='\t')
        assert len(trans_lower) == 0
        trans_lower = pd.read_csv(StringIO(
            runner.invoke(
                dump, [f_in, '-m', '-H', '-r', 'chr2', '-r2', 'chr1']).output),
                                  sep='\t')
        assert len(trans_lower) > 0

        # roundtrip square data
        f_in = op.join(datadir, 'toy.asymm.2.cool')
        bins = pd.read_csv(StringIO(
            runner.invoke(dump, [f_in, '-H', '-t', 'bins']).output),
                           sep='\t')
        pixels = pd.read_csv(StringIO(
            runner.invoke(dump, [f_in, '-H']).output),
                             sep='\t')
        cooler.create_cooler('out.cool', bins, pixels, symmetric_upper=False)
        cooler_cmp(f_in, 'out.cool')
        pixels2 = pd.read_csv(StringIO(
            runner.invoke(dump, [f_in, '--matrix', '-H']).output),
                              sep='\t')
        assert np.allclose(pixels, pixels2)

        # for square data, -m is a no-op
        lower1 = pd.read_csv(StringIO(
            runner.invoke(dump,
                          [f_in, '-H', '-r', 'chr2', '-r2', 'chr1']).output),
                             sep='\t')
        lower2 = pd.read_csv(StringIO(
            runner.invoke(
                dump, [f_in, '-m', '-H', '-r', 'chr2', '-r2', 'chr1']).output),
                             sep='\t')
        assert np.allclose(lower1, lower2)
コード例 #14
0
ファイル: utilities.py プロジェクト: yluan91/HiCPeaks
def create_from_unordered(cool_uri, bins, chunks, columns=None, dtypes=None, mergebuf=int(20e6),
                         delete_temp=True, temp_dir=None, **kwargs):
    """
    Create a Cooler in two passes via an external sort mechanism. In the first 
    pass, a sequence of data chunks are processed and sorted in memory and saved
    to temporary Coolers. In the second pass, the temporary Coolers are merged 
    into the output. This way the individual chunks do not need to be provided
    in any particular order.
    
    Parameters
    ----------
    cool_uri : str
        Path to Cooler file or URI to Cooler group. If the file does not exist,
        it will be created.
    bins : DataFrame
        Segmentation of the chromosomes into genomic bins. May contain 
        additional columns.
    chunks : iterable of DataFrames
        Sequence of chunks that get processed and written to separate Coolers 
        and then subsequently merged.
    columns : sequence of str, optional
        Specify here the names of any additional value columns from the input 
        besides 'count' to store in the Cooler. The standard columns ['bin1_id', 
        'bin2_id', 'count'] can be provided, but are already assumed and don't 
        need to be given explicitly. Additional value columns provided here will 
        be stored as np.float64 unless otherwised specified using `dtype`.
    dtypes : dict, optional
        Dictionary mapping column names to dtypes. Can be used to override the
        default dtypes of ``bin1_id``, ``bin2_id`` or ``count`` or assign
        dtypes to custom value columns. Non-standard value columns given in
        ``dtypes`` must also be provided in the ``columns`` argument or they
        will be ignored.
    assembly : str, optional
        Name of genome assembly.
    mode : {'w' , 'a'}, optional [default: 'w']
        Write mode for the output file. 'a': if the output file exists, append
        the new cooler to it. 'w': if the output file exists, it will be
        truncated. Default is 'w'.
    metadata : dict, optional
        Experiment metadata to store in the file. Must be JSON compatible.
    mergebuf : int, optional
        Maximum number of records to buffer in memory at any give time during 
        the merge step.
    delete_temp : bool, optional
        Whether to delete temporary files when finished. 
        Useful for debugging. Default is False.
    temp_dir : str, optional
        Create temporary files in this directory.

    See also
    --------
    sanitize_records
    sanitize_pixels

    """
    bins = bins.copy()
    bins['chrom'] = bins['chrom'].astype(object)

    tf = tempfile.NamedTemporaryFile(
                suffix='.multi.cool', 
                delete=delete_temp,
                dir=temp_dir)
        
    uris = []
    for i, chunk in enumerate(chunks):
        uri = tf.name + '::' + str(i)
        uris.append(uri)
        log.info('Writing chunk {}: {}'.format(i, uri))
        create_cooler(uri, bins, chunk, columns=columns, mode='a', boundscheck=False,
                      triucheck=False, dupcheck=False, ensure_sorted=False, ordered=True,
                      dtypes=dtypes)
        
    chunks = CoolerMerger([Cooler(uri) for uri in uris], mergebuf)

    log.info('Merging into {}'.format(cool_uri))
    create_cooler(cool_uri, bins, chunks, columns=columns, dtypes=dtypes, ordered=True,
                  **kwargs)
コード例 #15
0
ファイル: utilities.py プロジェクト: yluan91/HiCPeaks
    def __init__(self, datasets, outfil, assembly='hg38', chromsizes_file=None, chroms=['#','X'], onlyIntra=True,
        dtype='int'):

        self.outfil = os.path.abspath(os.path.expanduser(outfil))
        if os.path.exists(self.outfil):
            log.error('Cooler file {} already exists, exit ...'.format(self.outfil))
            sys.exit(1)
        self.chroms = set(chroms)
        self.onlyIntra = onlyIntra
        data = datasets

        ## Ready for data loading
        if not chromsizes_file is None:
            chromsizes_path = os.path.abspath(os.path.expanduser(chromsizes_file))
            log.info('Read chromosome sizes from {}'.format(chromsizes_path))
            chromsizes = readChromSizes(chromsizes_path, self.chroms)
        else:
            log.info('Fetch chromosome sizes from UCSC ...')
            chromsizes = fetchChromSizes(assembly, self.chroms)
        chromlist = chromsizes.keys()
        # sort chromosome labels
        tmp = list(map(str, sorted(map(int, [i for i in chromlist if i.isdigit()]))))
        nondigits = [i for i in chromlist if not i.isdigit()]
        for i in ['X','Y','M']:
            if i in nondigits:
                tmp.append(nondigits.pop(nondigits.index(i)))
        chromlist = tmp + sorted(nondigits)
        lengths = [chromsizes[i] for i in chromlist]
        self.chromsizes = pd.Series(data=lengths, index=chromlist)
        log.info('Done')

        ## We don't read data into memory at this point.
        ## Waiting for more robust conditions, here I assume there is no sign '_' in any chromosome labels.
        self.Map = {}
        for res in data:
            if data[res].endswith('.npz'):
                self.Map[res] = {}
                lib = np.load(data[res])
                for i in lib.files:
                    if (not '_' in i) and ((not self.chroms) or (i.isdigit() and '#' in self.chroms) or (i in self.chroms)):
                        # Compatible with TADLib and old version of runHiC
                        c1 = c2 = i
                        self.Map[res][(c1,c2)] = lib
                    else:
                        tmp = i.split('_')
                        if len(tmp)!=2:
                            continue
                        c1, c2 = tmp
                        check1 = ((not self.chroms) or (c1.isdigit() and '#' in self.chroms) or (c1 in self.chroms))
                        check2 = ((not self.chroms) or (c2.isdigit() and '#' in self.chroms) or (c2 in self.chroms))
                        if check1 and check2:
                            self.Map[res][(c1,c2)] = lib
            else:
                self.Map[res] = self._scanFolder(data[res])

        self._intertype = np.dtype({'names':['bin1', 'bin2', 'IF'],
                                    'formats':[np.int, np.int, np.float]})
        
        log.info('Extract and save data into cooler format for each resolution ...')
        for res in self.Map:
            log.info('Current resolution: {}bp'.format(res))
            byres = self.Map[res]
            # Extract parts of chromsizes
            subset = []
            for c1, c2 in byres:
                subset.extend([c1,c2])
            subset = set(subset)
            Bool = [(i in subset) for i in self.chromsizes.index]
            chromsizes = self.chromsizes[Bool]
            bin_cumnums = self.binCount(chromsizes, res)
            log.info('Generate bin table ...')
            bintable = binnify(chromsizes, res)
            pixels = self._generator(byres, chromsizes, bin_cumnums)
            if os.path.exists(self.outfil):
                mode = 'a'
            else:
                mode = 'w'
            if dtype == 'int':
                dtypes = {'count': np.int32}
            else:
                dtypes = {'count': np.float64}
            cooler_uri = '{}::{}'.format(self.outfil, res)
            if self.onlyIntra:
                create_cooler(cooler_uri, bintable, pixels, assembly=assembly, mode=mode,
                       boundscheck=False, triucheck=False, dupcheck=False, ensure_sorted=False,
                       ordered=True, metadata={'onlyIntra':str(self.onlyIntra)}, dtypes=dtypes)
            else:
                create_from_unordered(cooler_uri, bintable, pixels, assembly=assembly,
                                      mode=mode, metadata={'onlyIntra':str(self.onlyIntra)},
                                      delete_temp=True, boundscheck=False, triucheck=False,
                                      dupcheck=False, ensure_sorted=False, dtypes=dtypes)
コード例 #16
0
ファイル: cool.py プロジェクト: bitfan/HiCMatrix
    def save(self, pFileName, pSymmetric=True, pApplyCorrection=True):
        log.debug('Save in cool format')

        self.matrix.eliminate_zeros()

        if self.nan_bins is not None and len(self.nan_bins) > 0 and self.fileWasH5:
            # remove nan_bins
            correction_factors = np.ones(self.matrix.shape[0])
            correction_factors[self.nan_bins] = 0
            self.matrix.sort_indices()
            _instances, _features = self.matrix.nonzero()

            instances_factors = correction_factors[_instances]
            features_factors = correction_factors[_features]

            instances_factors = np.logical_not(np.logical_or(instances_factors, features_factors))
            self.matrix.data[instances_factors] = 0
            self.matrix.eliminate_zeros()

        # set possible nans in data to 0
        mask = np.isnan(self.matrix.data)

        self.matrix.data[mask] = 0
        self.matrix.eliminate_zeros()
        # save only the upper triangle of the
        if pSymmetric:
            # symmetric matrix
            self.matrix = triu(self.matrix, format='csr')
        else:
            self.matrix = self.matrix

        self.matrix.eliminate_zeros()

        # create data frame for bins
        # self.cut_intervals is having 4 tuples, bin_data_frame should have 3.correction_factors
        # it looks like it is faster to create it with 4, and drop the last one
        # instead of handling this before.
        bins_data_frame = pd.DataFrame(self.cut_intervals, columns=['chrom', 'start', 'end', 'interactions']).drop('interactions', axis=1)
        dtype_pixel = {'bin1_id': np.int32, 'bin2_id': np.int32, 'count': np.int32}
        if self.correction_factors is not None and pApplyCorrection:
            dtype_pixel['weight'] = np.float32

            # if the correction was applied by a division, invert it because cool format expects multiplicative if table name is 'weight'
            # https://cooler.readthedocs.io/en/latest/api.html#cooler.Cooler.matrix
            if (self.hic2cool_version is not None and self.hic2cool_version >= '0.5') or self.fileWasH5 or self.correctionOperator == '/':

                log.debug('h5 true')
                self.correction_factors = np.array(self.correction_factors).flatten()
                self.correction_factors = 1 / self.correction_factors
                mask = np.isnan(self.correction_factors)
                self.correction_factors[mask] = 0
                mask = np.isinf(self.correction_factors)
                self.correction_factors[mask] = 0
                self.correctionOperator = '*'
                log.debug('inverted correction factors')
            weight = convertNansToOnes(np.array(self.correction_factors).flatten())
            bins_data_frame = bins_data_frame.assign(weight=weight)

            log.debug("Reverting correction factors on matrix...")
            instances, features = self.matrix.nonzero()
            self.correction_factors = np.array(self.correction_factors)

            # do not apply if correction factors are just 1's
            instances_factors = self.correction_factors[instances]
            features_factors = self.correction_factors[features]

            instances_factors *= features_factors

            self.matrix.data = self.matrix.data.astype(float)

            # Apply the invert operation to get the original data
            if self.correctionOperator == '*' or self.correctionOperator is None:
                self.matrix.data /= instances_factors

            instances_factors = None
            features_factors = None

            self.matrix.eliminate_zeros()

        if self.correction_factors is not None and pApplyCorrection is False:
            dtype_pixel['weight'] = np.float32
            weight = convertNansToOnes(np.array(self.correction_factors).flatten())
            bins_data_frame = bins_data_frame.assign(weight=weight)

        instances, features = self.matrix.nonzero()

        matrix_data_frame = pd.DataFrame(instances, columns=['bin1_id'], dtype=np.int32)
        del instances
        matrix_data_frame = matrix_data_frame.assign(bin2_id=features)
        del features

        if self.enforceInteger:
            dtype_pixel['count'] = np.int32
            data = np.rint(self.matrix.data)
            matrix_data_frame = matrix_data_frame.assign(count=data)
        else:
            matrix_data_frame = matrix_data_frame.assign(count=self.matrix.data)

        if not self.enforceInteger and self.matrix.dtype not in [np.int32, int]:
            log.debug("Writing non-standard cooler matrix. Datatype of matrix['count'] is: {}".format(self.matrix.dtype))
            dtype_pixel['count'] = self.matrix.dtype
        split_factor = 1
        if len(self.matrix.data) > 1e7:
            split_factor = 1e4
            matrix_data_frame = np.array_split(matrix_data_frame, split_factor)

        if self.appendData:
            self.appendData = 'a'
        else:
            self.appendData = 'w'

        info = {}
        # these fields are created by cooler lib. Can cause errors if not deleted.
        if 'metadata' in info:
            if self.hic_metadata is None:
                self.hic_metadata = info['metadata']
            del info['metadata']
        if 'bin-size' in info:
            del info['bin-size']
        if 'bin-type' in info:
            del info['bin-type']

        info['format'] = str('HDF5::Cooler')
        info['format-url'] = str('https://github.com/mirnylab/cooler')
        info['generated-by'] = str('HiCMatrix-' + __version__)
        info['generated-by-cooler-lib'] = str('cooler-' + cooler.__version__)

        info['tool-url'] = str('https://github.com/deeptools/HiCMatrix')

        # info['nchroms'] = int(bins_data_frame['chrom'][:].nunique())
        # info['chromosomes'] = list(bins_data_frame['chrom'][:].unique())
        # info['nnz'] = np.string_(str(self.matrix.nnz * 2))
        # info['min-value'] = np.string_(str(matrix_data_frame['count'].min()))
        # info['max-value'] = np.string_(str(matrix_data_frame['count'].max()))
        # info['sum-elements'] = int(matrix_data_frame['count'].sum())

        if self.hic_metadata is not None and 'matrix-generated-by' in self.hic_metadata:
            info['matrix-generated-by'] = str(self.hic_metadata['matrix-generated-by'])
            del self.hic_metadata['matrix-generated-by']
        if self.hic_metadata is not None and 'matrix-generated-by-url' in self.hic_metadata:
            info['matrix-generated-by-url'] = str(self.hic_metadata['matrix-generated-by-url'])
            del self.hic_metadata['matrix-generated-by-url']
        if self.hic_metadata is not None and 'genome-assembly' in self.hic_metadata:
            info['genome-assembly'] = str(self.hic_metadata['genome-assembly'])
            del self.hic_metadata['genome-assembly']

        local_temp_dir = os.path.dirname(os.path.realpath(pFileName))
        cooler.create_cooler(cool_uri=pFileName,
                             bins=bins_data_frame,
                             pixels=matrix_data_frame,
                             mode=self.appendData,
                             dtypes=dtype_pixel,
                             ordered=True,
                             metadata=self.hic_metadata,
                             temp_dir=local_temp_dir)

        if self.appendData == 'w':
            fileName = pFileName.split('::')[0]
            with h5py.File(fileName, 'r+') as h5file:
                h5file.attrs.update(info)
                h5file.close()
コード例 #17
0
ファイル: utilities.py プロジェクト: XiaoTaoWang/HiCPeaks
def create_from_unordered(cool_uri, bins, chunks, columns=None, dtypes=None, mergebuf=int(20e6),
                         delete_temp=True, temp_dir=None, **kwargs):
    """
    Create a Cooler in two passes via an external sort mechanism. In the first 
    pass, a sequence of data chunks are processed and sorted in memory and saved
    to temporary Coolers. In the second pass, the temporary Coolers are merged 
    into the output. This way the individual chunks do not need to be provided
    in any particular order.
    
    Parameters
    ----------
    cool_uri : str
        Path to Cooler file or URI to Cooler group. If the file does not exist,
        it will be created.
    bins : DataFrame
        Segmentation of the chromosomes into genomic bins. May contain 
        additional columns.
    chunks : iterable of DataFrames
        Sequence of chunks that get processed and written to separate Coolers 
        and then subsequently merged.
    columns : sequence of str, optional
        Specify here the names of any additional value columns from the input 
        besides 'count' to store in the Cooler. The standard columns ['bin1_id', 
        'bin2_id', 'count'] can be provided, but are already assumed and don't 
        need to be given explicitly. Additional value columns provided here will 
        be stored as np.float64 unless otherwised specified using `dtype`.
    dtypes : dict, optional
        Dictionary mapping column names to dtypes. Can be used to override the
        default dtypes of ``bin1_id``, ``bin2_id`` or ``count`` or assign
        dtypes to custom value columns. Non-standard value columns given in
        ``dtypes`` must also be provided in the ``columns`` argument or they
        will be ignored.
    assembly : str, optional
        Name of genome assembly.
    mode : {'w' , 'a'}, optional [default: 'w']
        Write mode for the output file. 'a': if the output file exists, append
        the new cooler to it. 'w': if the output file exists, it will be
        truncated. Default is 'w'.
    metadata : dict, optional
        Experiment metadata to store in the file. Must be JSON compatible.
    mergebuf : int, optional
        Maximum number of records to buffer in memory at any give time during 
        the merge step.
    delete_temp : bool, optional
        Whether to delete temporary files when finished. 
        Useful for debugging. Default is False.
    temp_dir : str, optional
        Create temporary files in this directory.

    See also
    --------
    sanitize_records
    sanitize_pixels

    """
    bins = bins.copy()
    bins['chrom'] = bins['chrom'].astype(object)

    tf = tempfile.NamedTemporaryFile(
                suffix='.multi.cool', 
                delete=delete_temp,
                dir=temp_dir)
        
    uris = []
    for i, chunk in enumerate(chunks):
        uri = tf.name + '::' + str(i)
        uris.append(uri)
        log.info('Writing chunk {}: {}'.format(i, uri))
        create_cooler(uri, bins, chunk, columns=columns, mode='a', boundscheck=False,
                      triucheck=False, dupcheck=False, ensure_sorted=False, ordered=True,
                      dtypes=dtypes)
        
    chunks = CoolerMerger([Cooler(uri) for uri in uris], mergebuf)

    log.info('Merging into {}'.format(cool_uri))
    create_cooler(cool_uri, bins, chunks, columns=columns, dtypes=dtypes, ordered=True,
                  **kwargs)
コード例 #18
0
ファイル: utilities.py プロジェクト: XiaoTaoWang/HiCPeaks
    def __init__(self, datasets, outfil, assembly='hg38', chromsizes_file=None, chroms=['#','X'], onlyIntra=True,
        dtype='int'):

        self.outfil = os.path.abspath(os.path.expanduser(outfil))
        if os.path.exists(self.outfil):
            log.error('Cooler file {} already exists, exit ...'.format(self.outfil))
            sys.exit(1)
        self.chroms = set(chroms)
        self.onlyIntra = onlyIntra
        data = datasets

        ## Ready for data loading
        if not chromsizes_file is None:
            chromsizes_path = os.path.abspath(os.path.expanduser(chromsizes_file))
            log.info('Read chromosome sizes from {}'.format(chromsizes_path))
            chromsizes = readChromSizes(chromsizes_path, self.chroms)
        else:
            log.info('Fetch chromosome sizes from UCSC ...')
            chromsizes = fetchChromSizes(assembly, self.chroms)
        chromlist = chromsizes.keys()
        # sort chromosome labels
        tmp = list(map(str, sorted(map(int, [i for i in chromlist if i.isdigit()]))))
        nondigits = [i for i in chromlist if not i.isdigit()]
        for i in ['X','Y','M']:
            if i in nondigits:
                tmp.append(nondigits.pop(nondigits.index(i)))
        chromlist = tmp + sorted(nondigits)
        lengths = [chromsizes[i] for i in chromlist]
        self.chromsizes = pd.Series(data=lengths, index=chromlist)
        log.info('Done')

        ## We don't read data into memory at this point.
        ## Waiting for more robust conditions, here I assume there is no sign '_' in any chromosome labels.
        self.Map = {}
        for res in data:
            if data[res].endswith('.npz'):
                self.Map[res] = {}
                lib = np.load(data[res])
                for i in lib.files:
                    if (not '_' in i) and ((not self.chroms) or (i.isdigit() and '#' in self.chroms) or (i in self.chroms)):
                        # Compatible with TADLib and old version of runHiC
                        c1 = c2 = i
                        self.Map[res][(c1,c2)] = lib
                    else:
                        tmp = i.split('_')
                        if len(tmp)!=2:
                            continue
                        c1, c2 = tmp
                        check1 = ((not self.chroms) or (c1.isdigit() and '#' in self.chroms) or (c1 in self.chroms))
                        check2 = ((not self.chroms) or (c2.isdigit() and '#' in self.chroms) or (c2 in self.chroms))
                        if check1 and check2:
                            self.Map[res][(c1,c2)] = lib
            else:
                self.Map[res] = self._scanFolder(data[res])

        self._intertype = np.dtype({'names':['bin1', 'bin2', 'IF'],
                                    'formats':[np.int, np.int, np.float]})
        
        log.info('Extract and save data into cooler format for each resolution ...')
        for res in self.Map:
            log.info('Current resolution: {}bp'.format(res))
            byres = self.Map[res]
            # Extract parts of chromsizes
            subset = []
            for c1, c2 in byres:
                subset.extend([c1,c2])
            subset = set(subset)
            Bool = [(i in subset) for i in self.chromsizes.index]
            chromsizes = self.chromsizes[Bool]
            bin_cumnums = self.binCount(chromsizes, res)
            log.info('Generate bin table ...')
            bintable = binnify(chromsizes, res)
            pixels = self._generator(byres, chromsizes, bin_cumnums)
            if os.path.exists(self.outfil):
                mode = 'a'
            else:
                mode = 'w'
            if dtype == 'int':
                dtypes = {'count': np.int32}
            else:
                dtypes = {'count': np.float64}
            cooler_uri = '{}::{}'.format(self.outfil, res)
            if self.onlyIntra:
                create_cooler(cooler_uri, bintable, pixels, assembly=assembly, mode=mode,
                       boundscheck=False, triucheck=False, dupcheck=False, ensure_sorted=False,
                       ordered=True, metadata={'onlyIntra':str(self.onlyIntra)}, dtypes=dtypes)
            else:
                create_from_unordered(cooler_uri, bintable, pixels, assembly=assembly,
                                      mode=mode, metadata={'onlyIntra':str(self.onlyIntra)},
                                      delete_temp=True, boundscheck=False, triucheck=False,
                                      dupcheck=False, ensure_sorted=False, dtypes=dtypes)
コード例 #19
0
def test_dump():
    runner = CliRunner()
    with runner.isolated_filesystem():
        f_in = op.join(datadir, "toy.symm.upper.2.cool")
        result = runner.invoke(dump, [f_in])
        assert result.exit_code == 0
        result = runner.invoke(dump, [f_in, "-t", "chroms", "--columns", "length"])
        assert result.exit_code == 0
        result = runner.invoke(dump, [f_in, "-t", "bins", "--columns", "chrom,start"])
        assert result.exit_code == 0
        result = runner.invoke(dump, [f_in, "-r", "chr1"])
        assert result.exit_code == 0
        result = runner.invoke(dump, [f_in, "-r", "chr1:0-16", "-r2", "chr1:10-25"])
        assert result.exit_code == 0
        result = runner.invoke(dump, [f_in, "-r", "chr1:10-25", "-r2", "chr1:0-5"])
        assert result.exit_code == 0
        result = runner.invoke(dump, [f_in, "--join"])
        assert result.exit_code == 0
        result = runner.invoke(dump, [f_in, "--join", "--one-based-ids"])
        assert result.exit_code == 0
        result = runner.invoke(dump, [f_in, "--join", "--one-based-starts"])
        assert result.exit_code == 0
        result = runner.invoke(dump, [f_in, "--annotate", "chrom", "--one-based-starts"])
        assert result.exit_code == 0

        # unbalanced file
        result = runner.invoke(dump, [f_in, "-b"])
        assert result.exit_code == 1

        # roundtrip symm-upper data
        result = runner.invoke(dump, [f_in, "-H", "-t", "bins"])
        bins = pd.read_csv(StringIO(result.output), sep="\t")
        result = runner.invoke(dump, [f_in, "-H"])
        pixels = pd.read_csv(StringIO(result.output), sep="\t")
        cooler.create_cooler("out.cool", bins, pixels, symmetric_upper=True)
        cooler_cmp(f_in, "out.cool")

        # duplexed output
        result = runner.invoke(dump, [f_in, "--matrix", "-H"])
        pixels2 = pd.read_csv(StringIO(result.output), sep="\t")
        assert len(pixels2) > len(pixels)
        upper = pixels2[pixels2["bin1_id"] <= pixels2["bin2_id"]].reset_index(drop=True)
        assert np.allclose(pixels, upper)

        # lower triangle
        result = runner.invoke(dump, [f_in, "-H", "-r", "chr2", "-r2", "chr1"])
        trans_lower = pd.read_csv(StringIO(result.output), sep="\t")
        assert len(trans_lower) == 0
        result = runner.invoke(dump, [f_in, "-m", "-H", "-r", "chr2", "-r2", "chr1"])
        trans_lower = pd.read_csv(StringIO(result.output), sep="\t")
        assert len(trans_lower) > 0

        # roundtrip square data
        f_in = op.join(datadir, "toy.asymm.2.cool")
        result = runner.invoke(dump, [f_in, "-H", "-t", "bins"])
        bins = pd.read_csv(StringIO(result.output), sep="\t")
        result = runner.invoke(dump, [f_in, "-H"])
        pixels = pd.read_csv(StringIO(result.output), sep="\t")
        cooler.create_cooler("out.cool", bins, pixels, symmetric_upper=False)
        cooler_cmp(f_in, "out.cool")
        result = runner.invoke(dump, [f_in, "--matrix", "-H"])
        pixels2 = pd.read_csv(StringIO(result.output), sep="\t")
        assert np.allclose(pixels, pixels2)

        # for square data, -m is a no-op
        result = runner.invoke(dump, [f_in, "-H", "-r", "chr2", "-r2", "chr1"])
        lower1 = pd.read_csv(StringIO(result.output), sep="\t")
        result = runner.invoke(dump, [f_in, "-m", "-H", "-r", "chr2", "-r2", "chr1"])
        lower2 = pd.read_csv(StringIO(result.output), sep="\t")
        assert np.allclose(lower1, lower2)
コード例 #20
0
def to_cooler(hic,
              path,
              balance=True,
              multires=True,
              resolutions=None,
              n_zooms=10,
              threads=1,
              chunksize=100000,
              max_resolution=5000000,
              natural_order=True,
              chromosomes=None,
              **kwargs):
    """
    Export Hi-C data as Cooler file.

    Only contacts that have not been
    filtered are exported. https://github.com/mirnylab/cooler/

    Single resolution files:
    If input Hi-C matrix is uncorrected, the uncorrected matrix is stored.
    If it is corrected, the uncorrected matrix is stored along with bias vector.
    Cooler always calculates corrected matrix on-the-fly from the uncorrected
    matrix and the bias vector.

    Multi-resolution files (default):


    :param hic: Hi-C file in any compatible (RegionMatrixContainer) format
    :param path: Output path for cooler file
    :param balance: Include bias vector in cooler output (single res) or perform
                    iterative correction (multi res)
    :param multires: Generate a multi-resolution cooler file
    :param resolutions: Resolutions in bp (int) for multi-resolution cooler output
    :param chunksize: Number of pixels processed at a time in cooler
    :param kwargs: Additional arguments passed to cooler.iterative_correction
    """
    base_resolution = hic.bin_size

    tmp_files = []
    try:
        if multires:
            if resolutions is None:
                resolutions = [
                    base_resolution * 2**i for i in range(n_zooms)
                    if base_resolution * 2**i < max_resolution
                ]
            else:
                for r in resolutions:
                    if r % base_resolution != 0:
                        raise ValueError("Resolution {} must be a multiple of "
                                         "base resolution {}!".format(
                                             r, base_resolution))

            single_path = tempfile.NamedTemporaryFile(delete=False,
                                                      suffix='.cool').name
            tmp_files.append(single_path)
            multi_path = path
        else:
            single_path = path
            multi_path = None

        natural_key = cmp_to_key(natural_cmp)
        if chromosomes is None:
            chromosomes = hic.chromosomes()
            if natural_order:
                chromosomes = sorted(
                    chromosomes, key=lambda x: natural_key(x.encode('utf-8')))

        logger.info("Loading genomic regions")
        ix_converter = dict()
        regions = []
        region_order = []
        new_region_index = 0
        for chromosome in chromosomes:
            for region in hic.regions(chromosome, lazy=True):
                regions.append(
                    (region.chromosome, region.start - 1, region.end))
                ix_converter[region.ix] = new_region_index
                region_order.append(region.ix)
                new_region_index += 1
        region_df = pandas.DataFrame(regions,
                                     columns=['chrom', 'start', 'end'])

        def pixel_iter():
            for chri in range(len(chromosomes)):
                chromosome1 = chromosomes[chri]
                for chrj in range(chri, len(chromosomes)):
                    chromosome2 = chromosomes[chrj]

                    logger.info("{} - {}".format(chromosome1, chromosome2))

                    def chromosome_pixel_iter():
                        for edge in hic.edges((chromosome1, chromosome2),
                                              norm=False,
                                              lazy=True):
                            source, sink = ix_converter[
                                edge.source], ix_converter[edge.sink]
                            if sink < source:
                                source, sink = sink, source
                            yield source, sink, edge.weight

                    pixels = np.fromiter(chromosome_pixel_iter(),
                                         dtype=[("bin1_id", np.int_),
                                                ("bin2_id", np.int_),
                                                ("count", np.float_)])
                    pixels = np.sort(pixels, order=("bin1_id", "bin2_id"))
                    if len(pixels) > 0:
                        yield pandas.DataFrame(pixels)

        logger.info("Writing cooler")
        cooler.create_cooler(cool_uri=single_path,
                             bins=region_df,
                             pixels=pixel_iter(),
                             ordered=False)

        cool_path, group_path = cooler.util.parse_cooler_uri(single_path)

        if not multires:
            if balance:
                logger.info("Writing bias vector from FAN-C matrix")
                bias = hic.bias_vector()[np.array(region_order)]

                # Copied this section from
                # https://github.com/mirnylab/cooler/blob/356a89f6a62e2565f42ff13ec103352f20d251be/cooler/cli/balance.py#L195
                with h5py.File(cool_path, 'r+') as h5:
                    grp = h5[group_path]
                    # add the bias column to the file
                    h5opts = dict(compression='gzip', compression_opts=6)
                    grp['bins'].create_dataset("weight", data=bias, **h5opts)
            return CoolerHic(single_path)
        else:
            cooler.zoomify_cooler(single_path,
                                  multi_path,
                                  resolutions,
                                  chunksize,
                                  nproc=threads)
            if balance:
                logger.info("Balancing zoom resolutions...")
                for resolution in resolutions:
                    uri = multi_path + "::resolutions/" + str(resolution)
                    cool_path, group_path = cooler.util.parse_cooler_uri(uri)
                    cool = cooler.Cooler(uri)
                    bias, stats = cooler.balance_cooler(cool,
                                                        chunksize=chunksize,
                                                        **kwargs)
                    with h5py.File(cool_path, 'r+') as h5:
                        grp = h5[group_path]
                        # add the bias column to the file
                        h5opts = dict(compression='gzip', compression_opts=6)
                        grp['bins'].create_dataset("weight",
                                                   data=bias,
                                                   **h5opts)
                        grp['bins']['weight'].attrs.update(stats)
            return CoolerHic(multi_path +
                             '::resolutions/{}'.format(base_resolution))
    finally:
        for tmp_file in tmp_files:
            os.remove(tmp_file)
コード例 #21
0
def pixel_iter():
    chr_bin = 0
    for ch_no in range(1, 23):
        ch = f"chr{ch_no}"
        print(ch, chr_bin)
        with open(f"Hippo_{ch}") as hic_file:
            counts = {"bin1_id": list(), "bin2_id": list(), "count": list()}
            i = 0
            for line in hic_file:
                if line.strip():
                    row = line.split("\t")
                    for j, cnt in enumerate(row):
                        if j >= i and cnt.strip() != "":
                            counts["bin1_id"].append(chr_bin + i)
                            counts["bin2_id"].append(chr_bin + j)
                            counts["count"].append(int(cnt))
                    i += 1
            yield DataFrame(data=counts, copy=True)
            chr_bin += i + 1


bin_size = 40000

bins = read_bins(bin_size)
pixels = pixel_iter()

cooler.create_cooler(f"hippo.mcool::resolutions/{bin_size}",
                     bins,
                     pixels,
                     ordered=True)
コード例 #22
File: test_cli.py Project: Phlya/cooler
def test_dump():
    runner = CliRunner()
    with runner.isolated_filesystem():
        f_in = op.join(datadir, "toy.symm.upper.2.cool")
        result = runner.invoke(dump, [f_in])
        assert result.exit_code == 0

        # roundtrip symm-upper data
        bins = pd.read_csv(
            StringIO(runner.invoke(dump, [f_in, "-H", "-t", "bins"]).output), sep="\t"
        )
        pixels = pd.read_csv(
            StringIO(runner.invoke(dump, [f_in, "-H"]).output), sep="\t"
        )
        cooler.create_cooler("out.cool", bins, pixels, symmetric_upper=True)
        cooler_cmp(f_in, "out.cool")

        # duplexed output
        pixels2 = pd.read_csv(
            StringIO(runner.invoke(dump, [f_in, "--matrix", "-H"]).output), sep="\t"
        )
        assert len(pixels2) > len(pixels)
        upper = pixels2[pixels2["bin1_id"] <= pixels2["bin2_id"]].reset_index(drop=True)
        assert np.allclose(pixels, upper)

        # lower triangle
        trans_lower = pd.read_csv(
            StringIO(
                runner.invoke(dump, [f_in, "-H", "-r", "chr2", "-r2", "chr1"]).output
            ),
            sep="\t",
        )
        assert len(trans_lower) == 0
        trans_lower = pd.read_csv(
            StringIO(
                runner.invoke(
                    dump, [f_in, "-m", "-H", "-r", "chr2", "-r2", "chr1"]
                ).output
            ),
            sep="\t",
        )
        assert len(trans_lower) > 0

        # roundtrip square data
        f_in = op.join(datadir, "toy.asymm.2.cool")
        bins = pd.read_csv(
            StringIO(runner.invoke(dump, [f_in, "-H", "-t", "bins"]).output), sep="\t"
        )
        pixels = pd.read_csv(
            StringIO(runner.invoke(dump, [f_in, "-H"]).output), sep="\t"
        )
        cooler.create_cooler("out.cool", bins, pixels, symmetric_upper=False)
        cooler_cmp(f_in, "out.cool")
        pixels2 = pd.read_csv(
            StringIO(runner.invoke(dump, [f_in, "--matrix", "-H"]).output), sep="\t"
        )
        assert np.allclose(pixels, pixels2)

        # for square data, -m is a no-op
        lower1 = pd.read_csv(
            StringIO(
                runner.invoke(dump, [f_in, "-H", "-r", "chr2", "-r2", "chr1"]).output
            ),
            sep="\t",
        )
        lower2 = pd.read_csv(
            StringIO(
                runner.invoke(
                    dump, [f_in, "-m", "-H", "-r", "chr2", "-r2", "chr1"]
                ).output
            ),
            sep="\t",
        )
        assert np.allclose(lower1, lower2)
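For comparison, the same bins/pixels roundtrip can be sketched without the CLI, using the library API directly; the paths are placeholders:

import cooler

clr = cooler.Cooler("toy.symm.upper.2.cool")  # placeholder path
bins = clr.bins()[:]                          # full bins table
pixels = clr.pixels()[:]                      # upper-triangular pixels
cooler.create_cooler("roundtrip.cool", bins, pixels, symmetric_upper=True)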
Code example #23
File: contacts.py Project: nanoporetech/pore-c
def export_to_cooler(
    contact_table,
    output_prefix,
    cooler_resolution,
    fragment_table,
    chromsizes,
    query,
    query_columns=None,
    by_haplotype=False,
):

    results = []
    if query_columns:
        columns = query_columns[:]
    else:
        columns = []
    columns.extend(["align1_fragment_id", "align2_fragment_id"])
    if by_haplotype:
        columns.extend(["align1_haplotype", "align2_haplotype"])
    contact_df = dd.read_parquet(contact_table,
                                 engine=PQ_ENGINE,
                                 version=PQ_VERSION,
                                 columns=columns,
                                 index=False)
    if query:
        contact_df = contact_df.query(query)

    chrom_dict = pd.read_csv(chromsizes,
                             sep="\t",
                             header=None,
                             names=["chrom", "size"],
                             index_col=["chrom"]).squeeze("columns")  # the squeeze= kwarg was removed in pandas 2.0
    # create even-width bins using cooler
    bins_df = binnify(chrom_dict, cooler_resolution)
    bins_df.index.name = "bin_id"
    # convert to ranges for overlap
    bins = pr.PyRanges(bins_df.reset_index().rename(columns={
        "start": "Start",
        "end": "End",
        "chrom": "Chromosome"
    }))

    fragment_df = dd.read_parquet(fragment_table,
                                  engine=PQ_ENGINE,
                                  version=PQ_VERSION).compute()
    midpoint_df = pr.PyRanges(
        fragment_df.reset_index()[[
            "chrom", "start", "end", "fragment_id"
        ]].assign(start=lambda x: ((x.start + x.end) * 0.5).round(0).astype(
            int)).eval("end = start + 1").rename(columns={
                "chrom": "Chromosome",
                "start": "Start",
                "end": "End"
            }))
    # use a pyranges join to assign fragments to bins
    fragment_to_bin = midpoint_df.join(
        bins, how="left").df[["fragment_id", "bin_id"]]
    fragment_to_bin = fragment_to_bin.set_index(
        "fragment_id").sort_index()  # .astype(np.uint32)
    nulls = fragment_to_bin["bin_id"] == -1
    if nulls.any():
        logger.warning(
            "Some fragments did not overlap bins, removing from analysis:\n{}".
            format(fragment_to_bin[nulls].join(fragment_df)))
        fragment_to_bin = fragment_to_bin[~nulls]

    # use a join to assign each end of a contact to a bin
    binned_contacts = (contact_df.merge(
        fragment_to_bin,
        how="inner",
        right_index=True,
        left_on="align1_fragment_id").merge(
            fragment_to_bin,
            how="inner",
            right_index=True,
            left_on="align2_fragment_id",
            suffixes=[None, "_2"]).rename(columns={
                "bin_id": "bin1_id",
                "bin_id_2": "bin2_id"
            }))

    if not by_haplotype:
        cooler_path = output_prefix + ".cool"
        # group size == number of contacts per bin_pair
        pixels = binned_contacts.groupby(
            ["bin1_id",
             "bin2_id"]).size().rename("count").astype(np.int32).reset_index()
        create_cooler(cooler_path,
                      bins_df,
                      pixels,
                      ordered=True,
                      symmetric_upper=True,
                      ensure_sorted=True)
        c = Cooler(cooler_path)
        logger.info(f"Created cooler: {c.info}")
        results.append(cooler_path)
    else:
        tmp_parquet = output_prefix + ".tmp.pq"
        pixels = (
            # create a key to group by haplotype pair; order of haplotypes doesn't matter
            binned_contacts.assign(
                hap_key=lambda x: x[["align1_haplotype", "align2_haplotype"]
                                    ].apply(lambda y: "{}_{}".format(*sorted(
                                        y)).replace("-1", "nohap"),
                                            axis=1,
                                            meta="object")
            ).groupby(["hap_key", "bin1_id",
                       "bin2_id"]).size().rename("count").astype(
                           np.int32
                       ).reset_index().astype({"hap_key": "category"}))

        # save to a temporary parquet file, this might not be necessary
        # but want to avoid the whole contact matrix hitting memory
        pixels.to_parquet(
            tmp_parquet,
            write_metadata_file=True,
            partition_on=["hap_key"],
            write_index=False,
            engine=PQ_ENGINE,
            version=PQ_VERSION,
        )

        pixels = dd.read_parquet(tmp_parquet,
                                 engine=PQ_ENGINE,
                                 version=PQ_VERSION,
                                 columns=["hap_key"],
                                 index=False)
        hap_keys = pixels["hap_key"].unique().compute()
        # create a cooler for each haplotype pair
        for hap_key in hap_keys:
            cooler_path = f"{output_prefix}.{hap_key}.cool"
            pixels = dd.read_parquet(
                tmp_parquet,
                filters=[("hap_key", "==", hap_key)],
                index=False,
                engine=PQ_ENGINE,
                version=PQ_VERSION,
                columns=["bin1_id", "bin2_id", "count"],
            )
            create_cooler(cooler_path,
                          bins_df,
                          pixels,
                          ordered=True,
                          symmetric_upper=True,
                          ensure_sorted=True)
            c = Cooler(cooler_path)
            logger.info(f"Created cooler: {c.info}")
            results.append(cooler_path)

        shutil.rmtree(tmp_parquet)

    return results
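A hedged usage sketch for export_to_cooler; the parquet inputs and chromsizes path below are placeholders, not files shipped with pore-c:

# hypothetical invocation; all paths are placeholders
paths = export_to_cooler(
    contact_table="run1.contacts.parquet",
    output_prefix="run1.matrix",
    cooler_resolution=10000,
    fragment_table="run1.fragments.parquet",
    chromsizes="genome.chrom.sizes",
    query=None,
)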
Code example #24
File: down-sample.py Project: nedo0shki/HiCPlus-PC
    vec_of_prob = [p / total_num_reads for p in vec_of_prob]
    print("start of sampling...")
    down_sampled_counts = np.random.multinomial(num_sample_reads, vec_of_prob)
    print("sampling finished!")
    if not os.path.exists(args['output_folder_path']):
        os.makedirs(args['output_folder_path'])
    start_ind = 0
    for chr_file in chr_files_list:
        chr_data = pd.read_csv(os.path.join(COO_folder_path, chr_file),
                               delimiter="\t",
                               header=None)
        pixel_size = chr_data.shape[0]
        new_pixel = np.column_stack(
            (chr_data.iloc[:, 0], chr_data.iloc[:, 1],
             down_sampled_counts[start_ind:start_ind + pixel_size]))
        start_ind = start_ind + pixel_size
        np.savetxt(os.path.join(args['output_folder_path'], chr_file),
                   new_pixel,
                   delimiter="\t",
                   fmt="%i")
        print(chr_file + " is done!")
"""
Obs1: when we fetch a specific chromosome it means first columns belong to regions in that chromosome but
second column regions are through whole genome
Obs2: reads are not considered twice in files, for example when we fetch chr2 pixels, there are not interactions
between chr2 and chr1 any more.
Obs3: number of intra reads: 125015861, whole reads: 153752070 (in low resolution sample)
new_bins = high_res_cool.bins()
cooler.create_cooler(cool_uri = "/Users/neda/prostate-samples/PCa13266.down-sample.cool", bins = new_bins, pixels = new_pixel)
"""
Code example #25
def create_cooler_cc(
    output_prefix: str,
    bins: pd.DataFrame,
    pixels: pd.DataFrame,
    capture_name: str,
    capture_oligos: os.PathLike,
    capture_bins: Union[int, list] = None,
    suffix=None,
    **cooler_kwargs,
) -> os.PathLike:
    """
    Creates a cooler hdf5 file or cooler formatted group within a hdf5 file.

    Args:
     output_prefix (str): Output path for hdf5 file. If this already exists, will append a new group to the file.
     bins (pd.DataFrame): DataFrame containing the genomic coordinates of all bins in the pixels table.
     pixels (pd.DataFrame): DataFrame with columns: bin1_id, bin2_id, count.
     capture_name (str): Name of capture probe to store.
     capture_oligos (os.PathLike): Path to capture oligos used for the analysis.
     capture_bins (Union[int, list], optional): Bins containing capture oligos. Can be determined from oligos if not supplied. Defaults to None.
     suffix (str, optional): Suffix to append before the .hdf5 file extension. Defaults to None.

    Raises:
     ValueError: Capture name must exactly match the name of a supplied capture oligo.

    Returns:
     os.PathLike: Path of cooler hdf5 file.
    """

    # Gets capture coordinates
    capture_coords = get_capture_coords(capture_oligos, capture_name)

    # Make sure capture coordinates are returned correctly, if not, error.
    if capture_coords is None:
        raise ValueError(f"Incorrect capture name specified: {capture_name}.")

    # If capture bins not provided get them using the coordinates.
    if not capture_bins:
        capture_bins = get_capture_bins(
            bins,
            capture_coords["chrom"],
            capture_coords["start"],
            capture_coords["end"],
        )
        capture_bins = [int(x) for x in capture_bins]

    # Need to store bins as a list so make sure its not just a single int.
    elif isinstance(capture_bins, int):
        capture_bins = [
            int(capture_bins),
        ]

    # cooler.create_cooler will not accept numpy arrays, so convert to a plain python list
    elif isinstance(capture_bins, (np.ndarray, pd.Series)):  # np.array is a function, not a type
        capture_bins = [int(x) for x in capture_bins]

    # Get the number of cis interactions, required for normalisation.
    bins_cis = bins.query(f'chrom == "{capture_coords["chrom"]}"')["name"]
    pixels_cis = pixels.loc[lambda df: (df["bin1_id"].isin(bins_cis)) |
                            (df["bin2_id"].isin(bins_cis))]
    n_cis_interactions = pixels_cis["count"].sum()

    # Metadata for cooler file.
    metadata = {
        "capture_bins": capture_bins,
        "capture_name": capture_name,
        "capture_chrom": capture_coords['chrom'],
        "capture_coords":
        f'{capture_coords["chrom"]}:{capture_coords["start"]}-{capture_coords["end"]}',
        "n_cis_interactions": int(n_cis_interactions),
    }

    if os.path.exists(output_prefix):  # Will append to a preexisting file if one is supplied
        append_to_file = True
        cooler_fn = f"{output_prefix}::/{capture_name}"
    else:
        append_to_file = False
        cooler_fn = f"{output_prefix.replace('.hdf5', '')}.{capture_name}{'.' + suffix if suffix else ''}.hdf5"

    cooler.create_cooler(
        cooler_fn,
        bins=bins,
        pixels=pixels,
        metadata=metadata,
        mode="w" if not append_to_file else "a",
        **cooler_kwargs,
    )

    return cooler_fn
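A hedged usage sketch for create_cooler_cc; bins_df and pixels_df are assumed to be prepared upstream, and the capture name and oligo file are placeholders:

# hypothetical call; inputs are placeholders
path = create_cooler_cc(
    output_prefix="sample.hdf5",
    bins=bins_df,                        # cooler-style bins table
    pixels=pixels_df,                    # bin1_id / bin2_id / count
    capture_name="probe_1",
    capture_oligos="capture_oligos.bed",
)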
Code example #26
def createFakeMatrices(outfile, peakpos, peakwidth, length, resolution, count,
                       chromosome):
    errorMsg = ""
    if not outfile.endswith('.cool'):
        errorMsg += "Matrix output file must be in cooler format. Aborting\n"
    if peakwidth > length / 2:
        errorMsg += "peak width must not be more than half the peak length\n"
    if peakpos - peakwidth / 2 < 0 or peakpos + peakwidth / 2 > length:
        errorMsg += "Peak is not fully inside the range (0...length). Reduce peak width or adjust peak position\n"
    if errorMsg != "":
        sys.exit(errorMsg)

    adjustedLength = length - length % resolution
    binStartList = list(range(0, adjustedLength, resolution))
    binEndList = list(range(resolution, adjustedLength, resolution))
    binEndList.append(adjustedLength)
    if len(binStartList) != len(binEndList):
        errorMsg = "bug while creating bins. Start and end bin lists not equally long"
        sys.exit(errorMsg)
    bins = pd.DataFrame(columns=['chrom', 'start', 'end'])
    bins['start'] = binStartList
    bins['end'] = binEndList
    bins['chrom'] = chromosome

    bin1List = []
    bin2List = []
    for bin1Id in range(len(binStartList)):
        for bin2Id in range(len(binStartList)):
            bin1List.append(bin1Id)
            bin2List.append(bin2Id)

    pixels = pd.DataFrame(columns=['bin1_id', 'bin2_id', 'count'])
    pixels['bin1_id'] = bin1List
    pixels['bin2_id'] = bin2List
    pixels['count'] = 0

    adjustedPeakWidth = peakwidth - peakwidth % resolution
    peakStartBin = int((peakpos - adjustedPeakWidth / 2) / resolution)
    peakEndBin = peakStartBin + int(adjustedPeakWidth / resolution)
    m1 = pixels['bin1_id'] >= peakStartBin
    m2 = pixels['bin1_id'] < peakEndBin
    m3 = pixels['bin2_id'] >= peakStartBin
    m4 = pixels['bin2_id'] < peakEndBin
    mask = m1 & m2 & m3 & m4
    pixels.loc[mask, 'count'] = count
    pixels.sort_values(by=['bin1_id', 'bin2_id'], inplace=True)

    #assert that the resulting matrix is symmetric
    matIdx = (list(pixels['bin1_id']), list(pixels['bin2_id']))
    data = list(pixels['count'])
    mtrx = sparse.csr_matrix((data, matIdx)).todense()
    symmetric = np.allclose(mtrx, mtrx.T, rtol=1e-20, atol=1e-20)
    if not symmetric:
        errorMsg = 'bug: resulting matrix should be symmetric, but is not'
        sys.exit(errorMsg)

    cooler.create_cooler(outfile,
                         bins=bins,
                         pixels=pixels,
                         triucheck=False,
                         symmetric_upper=False)
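An example invocation of the generator above, with made-up coordinates that satisfy all of its input checks:

# a 2 Mb chromosome at 10 kb resolution with a 100 kb square block of
# count 10 centred at position 1 Mb (all values made up)
createFakeMatrices(outfile="fake.cool",
                   peakpos=1000000,
                   peakwidth=100000,
                   length=2000000,
                   resolution=10000,
                   count=10,
                   chromosome="chr1")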
Code example #27
def writeCooler(pMatrixList, pBinSizeInt, pOutfile, pChromosomeList, pChromSizeList=None,  pMetadata=None):
    #takes matrices as numpy arrays or sparse matrices and writes a single cooler file from them
    #modified from the study project so that multiple chromosomes can be written to one matrix

    def pixelGenerator(pMatrixList, pOffsetList):
        '''
        yields pixel dataframes per Matrix
        Parameters:
        pMatrixList: list of matrices as np.ndarray or sparse.csr_matrix
        pOffsetList: list of integers that specify the offset into the bins dataframe

        Yields:
        pixels: pixels dataframe for all Hi-C matrices in the input list
        '''
        for matrix, offset in zip(pMatrixList, pOffsetList):
            #create the pixels for cooler
            triu_Indices = np.triu_indices(matrix.shape[0])
            pixels_tmp = pd.DataFrame(columns=['bin1_id','bin2_id','count'])
            pixels_tmp['bin1_id'] = (triu_Indices[0] + offset).astype("uint32")
            pixels_tmp['bin2_id'] = (triu_Indices[1] + offset).astype("uint32")
            readCounts = matrix[triu_Indices]
            if sparse.isspmatrix_csr(matrix): #for sparse matrices, slicing is different
                readCounts = np.transpose(readCounts)
            pixels_tmp['count'] = np.float64(readCounts)
            pixels_tmp.sort_values(by=['bin1_id','bin2_id'],inplace=True)
            yield pixels_tmp

    if pMatrixList is None or pChromosomeList is None or pBinSizeInt is None or pOutfile is None:
        msg = "input empty. No cooler matrix written"
        print(msg)
        return
    if len(pMatrixList) != len(pChromosomeList):
        msg = "number of input arrays and chromosomes must be the same"
        print(msg)
        return
    if pChromSizeList is not None and len(pChromSizeList) != len(pChromosomeList):
        msg = "if chrom sizes are given, they must be provided for ALL chromosomes"
        print(msg)
        return
    bins = pd.DataFrame(columns=['chrom','start','end'])
    
    offsetList = [0]
    for i, (matrix, chrom) in enumerate(zip(pMatrixList,pChromosomeList)):
        #the chromosome size may not be integer-divisible by the bin size
        #so specifying the real chrom size is possible, but the
        #number of bins must still correspond to the matrix size
        chromSizeInt = int(matrix.shape[0] * pBinSizeInt)
        if pChromSizeList is not None \
                and pChromSizeList[i] is not None \
                and pChromSizeList[i] > (chromSizeInt - pBinSizeInt)\
                and pChromSizeList[i] < chromSizeInt:
            chromSizeInt = int(pChromSizeList[i])  #use the i-th chromosome's size (indexing with 0 here was a bug)

        #create the bins for cooler
        bins_tmp = pd.DataFrame(columns=['chrom','start','end'])
        binStartList = list(range(0, chromSizeInt, int(pBinSizeInt)))
        binEndList = list(range(int(pBinSizeInt), chromSizeInt, int(pBinSizeInt)))
        binEndList.append(chromSizeInt)
        bins_tmp['start'] = np.uint32(binStartList)
        bins_tmp['end'] = np.uint32(binEndList)
        bins_tmp["chrom"] = str(chrom)
        #DataFrame.append was removed in pandas 2.0, use pd.concat instead
        bins = pd.concat([bins, bins_tmp], ignore_index=True)
        offsetList.append(offsetList[-1] + bins_tmp.shape[0])
    #correct dtypes for joint dataframe
    bins["start"] = bins["start"].astype("uint32")
    bins["end"] = bins["end"].astype("uint32")
    offsetList = offsetList[:-1] #don't need the last one, no more matrix to follow

    #write out the cooler
    cooler.create_cooler(pOutfile, bins=bins, pixels=pixelGenerator(pMatrixList=pMatrixList, pOffsetList=offsetList), dtypes={'count': np.float64}, ordered=True, metadata=pMetadata)
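A minimal usage sketch for writeCooler with two toy chromosomes; all values are made up:

import numpy as np

# two small symmetric toy matrices, one per chromosome
mat1 = np.array([[4., 1., 0.], [1., 5., 2.], [0., 2., 6.]])
mat2 = np.array([[3., 1.], [1., 2.]])
writeCooler(pMatrixList=[mat1, mat2],
            pBinSizeInt=5000,
            pOutfile="toy.cool",
            pChromosomeList=["chr1", "chr2"])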
Code example #28
File: sample.py Project: zhang-jiankun/cooltools
def sample_cooler(
    clr,
    out_clr_path,
    count=None,
    frac=None,
    exact=False,
    map_func=map,
    chunksize=int(1e7),
):
    """
    Pick a random subset of contacts from a Hi-C map.

    Parameters
    ----------
    clr : cooler.Cooler or str
        A Cooler or a path/URI to a Cooler with input data.

    out_clr_path : str
        A path/URI to the output.

    count : float
        The target number of contacts in the sample.
        Mutually exclusive with `frac`.

    frac : float
        The target sample size as a fraction of contacts in the original
        dataset. Mutually exclusive with `count`.

    exact : bool
        If True, the resulting sample size will exactly match the target value.
        Exact sampling will load the whole pixel table into memory!
        If False, binomial sampling will be used instead and the sample size
        will be randomly distributed around the target value.

    map_func : function
        A map implementation.

    chunksize : int
        The number of pixels loaded and processed per step of computation.

    """
    if isinstance(clr, str):
        clr = cooler.Cooler(clr)

    if count is not None and frac is None:
        frac = count / clr.info["sum"]
    elif count is None and frac is not None:
        count = np.round(frac * clr.info["sum"])
    else:
        raise ValueError("Either frac or tot_count must be specified!")

    if frac >= 1.0:
        raise ValueError(
            "The number of contacts in a sample cannot exceed "
            "that in the original dataset."
        )

    if exact:
        pixels = sample_pixels_exact(clr.pixels()[:], count)
        cooler.create_cooler(out_clr_path, clr.bins()[:], pixels, ordered=True)

    else:
        pipeline = (
            cooler.tools.split(
                clr, include_bins=False, map=map_func, chunksize=chunksize
            )
            .pipe(_extract_pixel_chunk)
            .pipe(sample_pixels_approx, frac=frac)
        )

        cooler.create_cooler(out_clr_path, clr.bins()[:], iter(pipeline), ordered=True)
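A minimal usage sketch: downsampling a map to roughly half of its contacts via the default binomial path; the paths are placeholders:

# sample ~50% of contacts; with exact=False the result is binomially
# distributed around the target
sample_cooler("input.cool", "input.sampled.cool", frac=0.5)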
Code example #29
def toCooler(pixels, bins, outfile):
    #check if the inputs are as expected
    try:
        df_pixels = pd.read_csv(pixels, sep="\t", index_col=False)
        df_bins = pd.read_csv(bins, sep="\t", index_col=False)
    except Exception as e:
        msg = str(e) + "\nCould not read infiles, wrong files/format etc.?"
        raise SystemExit(msg)
    pixels_columns = {"cbin1", "cbin2", "expected_count", "observed_count"}
    bins_columns = {"cbin", "chr", "from.coord", "to.coord", "count"}
    if not pixels_columns.issubset(df_pixels.columns):
        msg = "pixels: not the expected column names"
        raise SystemExit(msg)
    if not bins_columns.issubset(df_bins.columns):
        msg = "bins: not the expected column names"
        raise SystemExit(msg)

    #prepare the pixels for cooler
    df_pixels.rename(columns={
        "cbin1": "bin1_id",
        "cbin2": "bin2_id",
        "observed_count": "count"
    },
                     inplace=True)
    df_pixels.drop(columns=["expected_count"], inplace=True)
    gt = df_pixels["bin1_id"] > df_pixels["bin2_id"]
    df_pixels = df_pixels[~gt]  #drop duplicates, keep the upper-triangular part of the matrix

    #prepare the bins for cooler
    df_bins.rename(columns={
        "chr": "chrom",
        "from.coord": "start",
        "to.coord": "end"
    },
                   inplace=True)
    df_bins.drop(columns=["count"], inplace=True)
    binsize = df_bins.iloc[0, :]["end"] - df_bins.iloc[0, :]["start"]
    chromnames = list(df_bins["chrom"].unique())
    print("chromnames:", chromnames)
    print("detected binsize:", binsize)
    #sometimes the last bin is present, but its "end" is the largest multiple of the bin size rather than the true chromosome size
    #reset it to the chromosome size in that case, otherwise the last bin would be duplicated (once with end==chromsize, once with end==maxbin*binsize)
    for chrom in chromnames:
        max_allowed_val = int(
            np.ceil(DM3_CHROM_SIZES[chrom] / binsize) * binsize)
        chromfltr = df_bins["chrom"] == chrom
        max_given_val = df_bins[chromfltr]["end"].values[-1]
        if max_given_val == max_allowed_val:
            idx = df_bins.loc[chromfltr, "end"].index[-1]
            df_bins.loc[idx, "end"] = DM3_CHROM_SIZES[chrom]
            msg = "INFO: reset size of chromosome {:s} from {:d} to {:d}".format(
                chrom, max_given_val, DM3_CHROM_SIZES[chrom])
            print(msg)
        elif max_given_val < max_allowed_val:
            pass
        else:
            msg = "Chrom {:s} is larger than expected from ref. genome dm3.".format(
                chrom)
            raise SystemExit(msg)

    #The provided bins dataframe is sparse (missing bins at start, end, and in-between)
    #So create a new one, which contains all bins
    df_bins_cpl = pd.DataFrame()
    for chrom in chromnames:  #renamed from "chr", which shadowed the builtin
        chromsize = DM3_CHROM_SIZES[chrom]
        start_list = list(range(0, chromsize, binsize))
        end_list = list(range(binsize, chromsize, binsize)) + [chromsize]
        df1 = pd.DataFrame()
        df1["start"] = start_list
        df1["end"] = end_list
        df1["chrom"] = chrom
        #DataFrame.append was removed in pandas 2.0, use pd.concat instead
        df_bins_cpl = pd.concat([df_bins_cpl, df1], ignore_index=True)
    print(df_bins_cpl[df_bins_cpl["chrom"] == "2R"].tail())
    df_bins_cpl.reset_index(inplace=True, drop=True)
    #get the old indices into the cpl bins dataframe, need them to update the pixels dataframe later
    df_bins_cpl = df_bins_cpl.merge(df_bins,
                                    on=["start", "end", "chrom"],
                                    how="outer")
    df_bins_cpl["cbin"].fillna(-1, inplace=True)
    df_bins_cpl["cbin"] = df_bins_cpl["cbin"].astype("int64")
    df_bins_cpl.sort_values(by=["chrom", "start", "end"], inplace=True)
    df_bins_cpl.reset_index(inplace=True, drop=True)
    df_bins_cpl["new_index"] = df_bins_cpl.index
    #update the bin ids in pixels df
    df_pixels = df_pixels.merge(df_bins_cpl,
                                left_on="bin1_id",
                                right_on="cbin",
                                how="inner")
    df_pixels["bin1_id"] = df_pixels["new_index"]
    df_pixels.drop(columns=["new_index", "cbin", "start", "end", "chrom"],
                   inplace=True)
    df_pixels = df_pixels.merge(df_bins_cpl,
                                left_on="bin2_id",
                                right_on="cbin",
                                how="inner")
    df_pixels["bin2_id"] = df_pixels["new_index"]
    df_pixels.drop(columns=["new_index", "cbin", "start", "end", "chrom"],
                   inplace=True)

    df_bins_cpl.drop(columns=["cbin", "new_index"], inplace=True)

    print("\nsome lines of bins df:")
    print(df_bins_cpl.head(10))
    print(df_bins_cpl[df_bins_cpl["chrom"] == "2L"].tail())
    print(df_bins_cpl[df_bins_cpl["chrom"] == "2R"].head())
    print(df_bins_cpl[df_bins_cpl["chrom"] == "2R"].tail())
    print(df_bins_cpl[df_bins_cpl["chrom"] == "3L"].head())
    print(df_bins_cpl[df_bins_cpl["chrom"] == "3L"].tail())
    print(df_bins_cpl[df_bins_cpl["chrom"] == "3R"].head())
    print(df_bins_cpl[df_bins_cpl["chrom"] == "3R"].tail())
    print(df_bins_cpl[df_bins_cpl["chrom"] == "4"].head())
    print(df_bins_cpl[df_bins_cpl["chrom"] == "4"].tail())
    print(df_bins_cpl[df_bins_cpl["chrom"] == "X"].head())
    print(df_bins_cpl.tail())

    print("\nsome lines of pixels df:")
    print(df_pixels.head())
    print(df_pixels.tail())

    #write the cooler file
    cooler.create_cooler(outfile,
                         bins=df_bins_cpl,
                         pixels=df_pixels,
                         ordered=True,
                         metadata={"fromFilenames": [pixels, bins]})