def test_bystrand_bydistance_pileups_with_controls(request):
    """
    Check the `n` column of pileups split by strand and distance band
    when the PileUpper is created with `control=True`.
    """
    data_dir = request.fspath.dirname

    # Load the cooler, the view regions and the features to pile up on:
    clr = cooler.Cooler(op.join(data_dir, "data/CN.mm9.1000kb.cool"))
    regions = bf.read_table(
        op.join(data_dir, "data/CN.mm9.toy_regions.bed"), schema="bed4"
    )
    features = bf.read_table(
        op.join(data_dir, "data/toy_features.bed"), schema="bed"
    )

    cc = CoordCreator(
        features,
        1_000_000,
        features_format="bed",
        local=False,
        flank=2_000_000,
        mindist=0,
    )
    pu = PileUpper(clr, cc, expected=False, view_df=regions, control=True)
    pup = pu.pileupsByStrandByDistanceWithControl()

    # Sort so that the expected counts line up with a fixed row order:
    ordered = pup.sort_values(["orientation", "distance_band"])
    assert np.all(ordered["n"] == [1, 2, 1, 1, 1])
def test_offdiag_pileups_without_expected(request):
    """
    Pile up off-diagonal (paired) windows with a CoolerSnipper, i.e. without
    an expected table, and check the shape and NaN pattern of the stack.
    """
    data_dir = request.fspath.dirname
    clr = cooler.Cooler(op.join(data_dir, "data/CN.mm9.1000kb.cool"))
    regions = bioframe.read_table(
        op.join(data_dir, "data/CN.mm9.toy_regions.bed"), schema="bed4"
    )

    def paired_windows(centers1, centers2):
        # Two sets of bin-aligned windows joined row by row into
        # off-diagonal features, then annotated with their region.
        left = cooltools.snipping.make_bin_aligned_windows(
            1_000_000, ["chr1", "chr1"], centers1, flank_bp=2_000_000
        )
        right = cooltools.snipping.make_bin_aligned_windows(
            1_000_000, ["chr1", "chr1"], centers2, flank_bp=2_000_000
        )
        paired = pd.merge(
            left, right, left_index=True, right_index=True, suffixes=("1", "2")
        )
        return cooltools.snipping.assign_regions(paired, regions).reset_index(
            drop=True
        )

    # I.
    # Both windows come from annotated genomic regions:
    windows = paired_windows(
        [102_000_000, 105_000_000], [105_000_000, 109_000_000]
    )
    snipper = cooltools.snipping.CoolerSnipper(clr, regions=regions)
    stack = cooltools.snipping.pileup(
        windows, snipper.select, snipper.snip, map=map
    )
    # Two snips of 5x5 bins each:
    assert stack.shape == (5, 5, 2)

    # II.
    # The second window comes from an unannotated genomic region:
    windows = paired_windows(
        [102_000_000, 10_000_000], [105_000_000, 109_000_000]
    )
    stack = cooltools.snipping.pileup(
        windows, snipper.select, snipper.snip, map=map
    )
    assert stack.shape == (5, 5, 2)
    first_snip, second_snip = stack[:, :, 0], stack[:, :, 1]
    assert np.all(np.isfinite(first_snip))
    assert np.all(np.isnan(second_snip))
def test_read_table():
    """
    Check the shapes returned by `bioframe.read_table` for several schemas,
    and that `schema_is_strict=True` raises when columns cannot be filled.
    """
    d = """chr1\nchr2\nchr2"""
    assert bioframe.read_table(StringIO(d), schema="bed3").shape == (3, 3)

    # raise a value error if any columns are filled with all NA
    with pytest.raises(ValueError):
        bioframe.read_table(StringIO(d), schema="bed3", schema_is_strict=True)

    # fill with nans to appropriate size if schema_is_strict=False (aka the default)
    d = "chr1 5 10\nchr1 10 20\nchr2 30 40"
    # The separator is a regex, so it must be a raw string: a plain "\s+"
    # is an invalid escape sequence and triggers a warning at compile time.
    assert bioframe.read_table(StringIO(d), schema="bed3", sep=r"\s+").shape == (3, 3)
    assert bioframe.read_table(StringIO(d), schema="bed6", sep=r"\s+").shape == (3, 6)
    assert bioframe.read_table(StringIO(d), schema="bed12", sep=r"\s+").shape == (3, 12)

    # bedpe has 10 columns
    d = (
        "chr1 5 10 chr2 5 10 interval1 . + -\n"
        "chr1 10 20 chr1 5 10 interval2 . + -\n"
        "chr2 30 40 chr2 5 10 interval3 12 + -\n"
    )
    assert bioframe.read_table(
        StringIO(d), schema="bedpe", sep=r"\s+", schema_is_strict=True
    ).shape == (3, 10)
def read_viewframe(
    fname,
    verify_cooler_view=None,
):
    """
    Read a BED file with regions and convert it into a viewframe
    (non-overlapping regions, unique names, etc).

    Parameters
    ----------
    fname : str
        Path to a BED file with regions.
    verify_cooler_view : None or viewframe
        Viewframe with entire chromosome sizes. When given, the regions are
        bounds-checked against it and must be contained in it.

    Returns
    -------
    view_df : pd.DataFrame
        DataFrame with the viewframe

    Raises
    ------
    ValueError
        If the table cannot be converted into a viewframe, or if the regions
        are not contained in `verify_cooler_view`.
    """
    verify = verify_cooler_view is not None

    # Chromosome sizes are taken from the "end" column of the reference view:
    if verify:
        chromsizes = verify_cooler_view.set_index("chrom")["end"]
    else:
        chromsizes = None

    # Try the 4-column layout (with a name column) first, then fall back
    # to the plain 3-column layout:
    try:
        view_df = bioframe.read_table(fname, schema="bed4", index_col=False)
    except Exception:
        view_df = bioframe.read_table(fname, schema="bed3", index_col=False)

    # Turn the raw table into a viewframe, bounds-checked when possible:
    try:
        if verify:
            view_df = bioframe.make_viewframe(view_df, check_bounds=chromsizes)
        else:
            view_df = bioframe.make_viewframe(view_df)
    except ValueError as e:
        raise ValueError(
            "View table is incorrect, please, comply with the format. ") from e

    # The view must sit inside the cooler bounds, but not vice versa
    # (the cooler may have more regions):
    if verify and not bioframe.is_contained(view_df, verify_cooler_view):
        raise ValueError(
            "View regions are not contained in cooler chromsizes bounds")

    return view_df
def test_cooler_snipper_with_regions(request):
    """
    Select the "foo" region with a CoolerSnipper built from a view and
    snip a single window out of it.
    """
    data_dir = request.fspath.dirname
    clr = cooler.Cooler(op.join(data_dir, "data/CN.mm9.1000kb.cool"))
    regions = bioframe.read_table(
        op.join(data_dir, "data/CN.mm9.toy_regions.bed"), schema="bed4"
    )

    snipper = cooltools.snipping.CoolerSnipper(clr, regions=regions)
    matrix = snipper.select("foo", "foo")

    # Same interval on both axes, i.e. an on-diagonal window:
    window = (110_000_000, 120_000_000, 110_000_000, 120_000_000)
    snippet = snipper.snip(matrix, "foo", "foo", window)
    assert snippet.shape is not None
def test_bystrand_pileups_with_expected(request):
    """
    Check the `n` column of by-strand pileups for several PileUpper
    configurations (with and without expected, balanced and unbalanced).
    """
    data_dir = request.fspath.dirname

    # Load the cooler, then the view and expected validated against it:
    clr = cooler.Cooler(op.join(data_dir, "data/CN.mm9.1000kb.cool"))
    regions = io.read_viewframe_from_file(
        op.join(data_dir, "data/CN.mm9.toy_regions.bed"),
        verify_cooler=clr,
    )
    exp = io.read_expected_from_file(
        op.join(data_dir, "data/CN.mm9.toy_expected.tsv"),
        expected_value_cols=["balanced.avg"],
        verify_view=regions,
        verify_cooler=clr,
    )
    features = bf.read_table(
        op.join(data_dir, "data/toy_features.bed"), schema="bed"
    )
    cc = CoordCreator(
        features,
        1_000_000,
        features_format="bed",
        local=False,
        flank=2_000_000,
        mindist=0,
    )

    # Every configuration is expected to give the same per-orientation counts:
    pileupper_options = (
        # Test with ooe=True
        dict(expected=exp, view_df=regions, ooe=True),
        # Test with ooe=False
        dict(expected=exp, view_df=regions, ooe=False),
        # No regions provided without expected
        dict(expected=False, ooe=False),
        # Unbalanced
        dict(expected=False, ooe=False, clr_weight_name=None, coverage_norm=True),
    )
    for options in pileupper_options:
        pu = PileUpper(clr, cc, **options)
        pup = pu.pileupsByStrandWithControl()
        counts = pup.sort_values("orientation")["n"]
        assert np.all(counts == [1, 3, 1, 1])
def test_ondiag_pileups_with_expected(request):
    """
    Pile up on-diagonal windows with each of the snipper classes that take
    an expected table (ObsExpSnipper and ExpectedSnipper).
    """
    data_dir = request.fspath.dirname
    clr = cooler.Cooler(op.join(data_dir, "data/CN.mm9.1000kb.cool"))
    exp = pd.read_table(op.join(data_dir, "data/CN.mm9.toy_expected.tsv"))
    regions = bioframe.read_table(
        op.join(data_dir, "data/CN.mm9.toy_regions.bed"), schema="bed4"
    )

    def ondiag_windows(centers):
        # Bin-aligned windows around the given positions, annotated with
        # the region each of them belongs to.
        aligned = cooltools.snipping.make_bin_aligned_windows(
            1_000_000, ["chr1", "chr1"], centers, flank_bp=2_000_000
        )
        return cooltools.snipping.assign_regions(aligned, regions).reset_index(
            drop=True
        )

    snipper_classes = (
        cooltools.snipping.ObsExpSnipper,
        cooltools.snipping.ExpectedSnipper,
    )
    for snipper_class in snipper_classes:
        snipper = snipper_class(clr, exp, regions=regions)

        # I.
        # Both windows come from annotated genomic regions:
        windows = ondiag_windows([102_000_000, 105_000_000])
        stack = cooltools.snipping.pileup(
            windows, snipper.select, snipper.snip, map=map
        )
        # Two snips of 5x5 bins each:
        assert stack.shape == (5, 5, 2)

        # II.
        # The second window comes from an unannotated genomic region:
        windows = ondiag_windows([120_000_000, 160_000_000])
        stack = cooltools.snipping.pileup(
            windows, snipper.select, snipper.snip, map=map
        )
        assert stack.shape == (5, 5, 2)
        assert np.all(np.isnan(stack[:, :, 1]))
def test_ondiag_pileup_legacy_without_expected(request):
    """
    Pile up on-diagonal windows through `pileup_legacy` with a CoolerSnipper
    (no expected table) and check the shape and NaN pattern of the stack.
    """
    data_dir = request.fspath.dirname
    clr = cooler.Cooler(op.join(data_dir, "data/CN.mm9.1000kb.cool"))
    view_df = bioframe.read_table(
        op.join(data_dir, "data/CN.mm9.toy_regions.bed"), schema="bed4"
    )

    def ondiag_windows(centers):
        # Bin-aligned windows around the given positions, annotated with
        # the view region each of them belongs to.
        aligned = cooltools.api.snipping.make_bin_aligned_windows(
            1_000_000, ["chr1", "chr1"], centers, flank_bp=2_000_000
        )
        return cooltools.api.snipping.assign_regions(
            aligned, view_df
        ).reset_index(drop=True)

    # I.
    # NOTE(review): this part uses the same coordinates as part II, where
    # the second window is treated as unannotated; confirm whether two
    # annotated positions were intended here.
    windows = ondiag_windows([120_000_000, 160_000_000])
    snipper = cooltools.api.snipping.CoolerSnipper(
        clr, view_df=view_df, min_diag=None
    )
    stack = cooltools.api.snipping.pileup_legacy(
        windows, snipper.select, snipper.snip, map=map
    )
    # Two snips of 5x5 bins each:
    assert stack.shape == (5, 5, 2)

    # II.
    # The second window comes from an unannotated genomic region:
    windows = ondiag_windows([120_000_000, 160_000_000])
    stack = cooltools.api.snipping.pileup_legacy(
        windows, snipper.select, snipper.snip, map=map
    )
    assert stack.shape == (5, 5, 2)
    first_snip, second_snip = stack[:, :, 0], stack[:, :, 1]
    assert np.all(np.isfinite(first_snip))
    assert np.all(np.isnan(second_snip))
def test_snipper_with_regions_and_expected(request):
    """
    Select the "foo" region and snip one window with each snipper class
    that takes an expected table.
    """
    data_dir = request.fspath.dirname
    clr = cooler.Cooler(op.join(data_dir, "data/CN.mm9.1000kb.cool"))
    exp = pd.read_table(op.join(data_dir, "data/CN.mm9.toy_expected.tsv"))
    regions = bioframe.read_table(
        op.join(data_dir, "data/CN.mm9.toy_regions.bed"), schema="bed4"
    )

    # Same interval on both axes, i.e. an on-diagonal window:
    window = (110_000_000, 120_000_000, 110_000_000, 120_000_000)

    snipper_classes = (
        cooltools.snipping.ObsExpSnipper,
        cooltools.snipping.ExpectedSnipper,
    )
    for snipper_class in snipper_classes:
        snipper = snipper_class(clr, exp, regions=regions)
        matrix = snipper.select("foo", "foo")
        snippet = snipper.snip(matrix, "foo", "foo", window)
        assert snippet.shape is not None
def test_offdiag_pileups_with_expected(request):
    """
    Pile up off-diagonal (paired) windows through `pileup_legacy` with each
    of the snipper classes that take an expected table.
    """
    data_dir = request.fspath.dirname
    clr = cooler.Cooler(op.join(data_dir, "data/CN.mm9.1000kb.cool"))
    exp = pd.read_table(op.join(data_dir, "data/CN.mm9.toy_expected.tsv"))
    view_df = bioframe.read_table(
        op.join(data_dir, "data/CN.mm9.toy_regions.bed"), schema="bed4"
    )

    def paired_windows(centers1, centers2):
        # Two sets of bin-aligned windows joined row by row into
        # off-diagonal features, then annotated with their view region.
        left = cooltools.snipping.make_bin_aligned_windows(
            1_000_000, ["chr1", "chr1"], centers1, flank_bp=2_000_000
        )
        right = cooltools.snipping.make_bin_aligned_windows(
            1_000_000, ["chr1", "chr1"], centers2, flank_bp=2_000_000
        )
        paired = pd.merge(
            left, right, left_index=True, right_index=True, suffixes=("1", "2")
        )
        return cooltools.snipping.assign_regions(paired, view_df).reset_index(
            drop=True
        )

    snipper_classes = (
        cooltools.snipping.ObsExpSnipper,
        cooltools.snipping.ExpectedSnipper,
    )
    for snipper_class in snipper_classes:
        snipper = snipper_class(clr, exp, view_df=view_df)

        # I.
        # Two off-diagonal features from annotated genomic regions:
        windows = paired_windows(
            [102_000_000, 105_000_000], [105_000_000, 109_000_000]
        )
        stack = cooltools.snipping.pileup_legacy(
            windows, snipper.select, snipper.snip, map=map
        )
        # Two snips of 5x5 bins each:
        assert stack.shape == (5, 5, 2)

        # II.
        # The second window is between two different regions:
        windows = paired_windows(
            [102_000_000, 10_000_000], [105_000_000, 109_000_000]
        )
        stack = cooltools.snipping.pileup_legacy(
            windows, snipper.select, snipper.snip, map=map
        )
        assert stack.shape == (5, 5, 2)
        assert np.all(np.isnan(stack[:, :, 1]))
def test_pileup(request):
    """
    Exercise `cooltools.api.snipping.pileup` on on-diagonal and off-diagonal
    features, including features outside the view and invalid intervals.
    """
    data_dir = request.fspath.dirname
    clr = cooler.Cooler(op.join(data_dir, "data/CN.mm9.1000kb.cool"))
    exp = pd.read_table(op.join(data_dir, "data/CN.mm9.toy_expected.tsv"))
    view_df = bioframe.read_table(
        op.join(data_dir, "data/CN.mm9.toy_regions.bed"), schema="bed4"
    )

    def pile(features):
        # All cases share the same cooler, view, expected and flank setting.
        return cooltools.api.snipping.pileup(
            clr, features, view_df, exp, flank=None
        )

    # I.
    # On-diagonal features, two regions from annotated genomic regions:
    windows = pd.DataFrame(
        {
            "chrom": ["chr1", "chr1"],
            "start": [102_000_000, 108_000_000],
            "end": [107_000_000, 113_000_000],
        }
    )
    stack = pile(windows)
    # Two snips of 5x5 bins each:
    assert stack.shape == (5, 5, 2)

    # II.
    # Off-diagonal features, two regions from annotated genomic regions:
    windows = pd.DataFrame(
        {
            "chrom1": ["chr1", "chr1"],
            "start1": [102_000_000, 107_000_000],
            "end1": [107_000_000, 112_000_000],
            "chrom2": ["chr1", "chr1"],
            "start2": [107_000_000, 113_000_000],
            "end2": [112_000_000, 118_000_000],
        }
    )
    stack = pile(windows)
    assert stack.shape == (5, 5, 2)

    # III.
    # Off-diagonal features, one region outside the view:
    windows = pd.DataFrame(
        {
            "chrom1": ["chr1", "chr1"],
            "start1": [90_000_000, 105_000_000],
            "end1": [95_000_000, 110_000_000],
            "chrom2": ["chr1", "chr1"],
            "start2": [105_000_000, 110_000_000],
            "end2": [110_000_000, 115_000_000],
        }
    )
    stack = pile(windows)
    assert stack.shape == (5, 5, 2)
    # The snip for the feature outside the view is all NaN:
    assert np.all(np.isnan(stack[:, :, 0]))

    # IV.
    # On-diagonal features that are not valid bedframes (start > end):
    windows = pd.DataFrame(
        {
            "chrom": ["chr1", "chr1"],
            "start": [107_000_000, 108_000_000],
            "end": [102_000_000, 113_000_000],
        }
    )
    with pytest.raises(ValueError):
        pile(windows)

    # DRAFT: with force=True the call above is meant to succeed and return
    # a stack of shape (5, 5, 2); not exercised yet.

    # V.
    # Off-diagonal features that are not valid bedframes (start > end):
    windows = pd.DataFrame(
        {
            "chrom1": ["chr1", "chr1"],
            "start1": [107_000_000, 107_000_000],
            "end1": [102_000_000, 112_000_000],
            "chrom2": ["chr1", "chr1"],
            "start2": [107_000_000, 113_000_000],
            "end2": [112_000_000, 118_000_000],
        }
    )
    with pytest.raises(ValueError):
        pile(windows)
def read_viewframe_from_file(
    view_fname,
    verify_cooler=None,
    check_sorting=False,
):
    """
    Read a BED file with regions and convert it into a viewframe
    (non-overlapping regions, unique names, etc).

    Parameters
    ----------
    view_fname : str
        Path to a BED file with regions.
    verify_cooler : cooler | None
        Cooler object the view is checked against for compatibility.
        No checks are done when None.
    check_sorting : bool
        Passed on to the compatibility check: whether regions in view_df
        must be sorted as the chromosomes in the cooler.

    Returns
    -------
    view_df : pd.DataFrame
        DataFrame with the viewframe

    Raises
    ------
    ValueError
        If the file cannot be read as a 3- or 4-column BED, cannot be turned
        into a viewframe, or is not compatible with `verify_cooler`.
    """
    # Try the 4-column layout (with a name column) first, then fall back
    # to the plain 3-column layout:
    try:
        view_df = bioframe.read_table(view_fname, schema="bed4", index_col=False)
    except Exception as err_bed4:
        try:
            view_df = bioframe.read_table(view_fname, schema="bed3", index_col=False)
        except Exception:
            # The reported cause is the first (bed4) failure.
            raise ValueError(
                f"{view_fname} is not a BED file with 3 or 4 columns"
            ) from err_bed4

    # Turn the raw table into a viewframe:
    try:
        view_df = bioframe.make_viewframe(view_df)
    except ValueError as e:
        raise ValueError(
            "View table is incorrect, please, comply with the format. ") from e

    # Without a cooler there is nothing to check against:
    if verify_cooler is None:
        return view_df

    try:
        is_compatible_viewframe(
            view_df, verify_cooler, check_sorting, raise_errors=True
        )
    except Exception as e:
        raise ValueError(
            "view_df is not compatible with the cooler") from e

    # view_df is compatible, returning it as is
    return view_df
def compute_pileup(
    cool_path,
    features,
    view,
    expected,
    flank,
    features_format,
    weight_name,
    out,
    out_format,
    store_snips,
    nproc,
    ignore_diags,
    aggregate,
    force,
    verbose,
):
    """
    Perform retrieval of the snippets from .cool file.

    COOL_PATH : The paths to a .cool file with a balanced Hi-C map. Use the
    '::' syntax to specify a group path in a multicooler file.

    FEATURES_PATH : the path to a BED or BEDPE-like file that contains features for snipping windows.
    If BED, then the features are on-diagonal. If BEDPE, then the features
    can be off-diagonal (but not in trans or between different regions in the view).

    """
    ##### Validate argument-only options first, so that a bad option fails
    ##### before the cooler is opened and the pileup is computed:
    agg_funcs = {
        "mean": np.nanmean,
        "none": np.nanmean,
        "median": np.nanmedian,
        "min": np.nanmin,
        "max": np.nanmax,
        "std": np.nanstd,
    }
    # A missing aggregate means "mean"; lower-casing is only valid for strings.
    agg_key = "mean" if aggregate is None else aggregate.lower()
    if agg_key not in agg_funcs:
        raise ValueError(
            f"Aggregation mode {agg_key} not supported. Please use mean/median/min/max/std."
        )
    agg_func = agg_funcs[agg_key]

    features_format = features_format.lower()
    if features_format == "bedpe":
        feature_cols = ["chrom1", "start1", "end1", "chrom2", "start2", "end2"]
    elif features_format == "bed":
        feature_cols = ["chrom", "start", "end"]
    else:
        raise ValueError(
            "Automatic detection of features format is not implemented yet. "
            "Please provide BED or BEDPE as --features-format")
    # Chromosome columns are strings, coordinates are 64-bit integers:
    dtypes = {
        col: (str if col.startswith("chrom") else np.int64)
        for col in feature_cols
    }

    clr = cooler.Cooler(cool_path)

    #### Read the features:
    buf, names = sniff_for_header(features)
    if names is None:
        # No header: take the leading columns by position and name them.
        read_kwargs = dict(
            header=None,
            names=feature_cols,
            usecols=list(range(len(feature_cols))),
        )
    else:
        # Header present: pick the required columns by name.
        read_kwargs = dict(header="infer", usecols=feature_cols)

    # `usecols` is passed exactly once (via read_kwargs); passing it both
    # explicitly and through the kwargs raises TypeError.
    features_df = pd.read_table(
        buf,
        comment="#",
        dtype=dtypes,
        verbose=verbose,
        **read_kwargs,
    )

    ###### Define the view for the pileup:
    # use input "view" BED file or all chromosomes of the cooler:
    if view is None:
        # Generate viewframe from clr.chromsizes:
        view_df = bioframe.make_viewframe(
            [(chrom, 0, clr.chromsizes[chrom]) for chrom in clr.chromnames]
        )
        if not bioframe.is_contained(features_df, view_df):
            raise ValueError(
                "Features are not contained in chromosomes bounds")
    else:
        # Read view_df, trying 4-column BED first and 3-column BED second:
        try:
            view_df = bioframe.read_table(view, schema="bed4", index_col=False)
        except Exception:
            view_df = bioframe.read_table(view, schema="bed3", index_col=False)
        # Convert view_df to viewframe:
        try:
            view_df = bioframe.make_viewframe(view_df, check_bounds=clr.chromsizes)
        except ValueError as e:
            raise ValueError(
                "View table is incorrect, please, comply with the format. "
            ) from e

        if not bioframe.is_contained(features_df, view_df):
            raise ValueError("Features are not contained in view bounds")

    ##### Read expected, should be cis-expected:
    if expected is not None:
        expected_path, expected_value_col = expected
        expected_summary_cols = [
            expected_value_col,
        ]
        expected = read_expected(
            expected_path,
            contact_type="cis",
            expected_value_cols=expected_summary_cols,
            verify_view=view_df,
            verify_cooler=clr,
        )

    ##### Create the pileup:
    stack = snipping.pileup(
        clr,
        features_df,
        view_df=view_df,
        expected_df=expected,
        flank=flank,
        min_diag=ignore_diags,  # TODO: implement in pileup API
        clr_weight_name=weight_name,  # TODO: implement in pileup API
        force=force,  # TODO: implement in pileup API
        nproc=nproc,
    )

    ##### Aggregate the signal along the snips axis:
    pileup = agg_func(stack, axis=2)

    ##### Store the data; the full stack is written only on request,
    ##### identically for both output formats:
    out_format = out_format.lower()
    if out_format == "npz":
        if store_snips:
            np.savez(out, pileup=pileup, stack=stack)
        else:
            np.savez(out, pileup=pileup)
    elif out_format == "hdf5":
        # The context manager makes sure the file is flushed and closed.
        with h5py.File(out, "w") as h5:
            h5.create_dataset("pileup", data=pileup)
            if store_snips:
                h5.create_dataset("stack", data=stack)
def compute_expected(
    cool_path,
    nproc,
    chunksize,
    output,
    contact_type,
    view,
    balance,
    clr_weight_name,
    ignore_diags,
):
    """
    Calculate expected Hi-C signal either for cis or for trans regions
    of chromosomal interaction map.

    When balancing weights are not applied to the data, there is no
    masking of bad bins performed.

    COOL_PATH : The paths to a .cool file with a balanced Hi-C map.

    """
    # Reject unknown contact types up front: otherwise neither branch below
    # would assign `result` and the failure would surface as an unbound name.
    if contact_type not in ("cis", "trans"):
        raise ValueError(
            f"contact_type must be 'cis' or 'trans', got {contact_type!r}"
        )

    clr = cooler.Cooler(cool_path)

    if view is None:
        view_df = None  # full chromosome case
    else:
        # Read view_df, trying 4-column BED first and 3-column BED second:
        try:
            view_df = bioframe.read_table(view, schema="bed4", index_col=False)
        except Exception:
            view_df = bioframe.read_table(view, schema="bed3", index_col=False)
        # Convert view dataframe to viewframe:
        try:
            view_df = bioframe.make_viewframe(view_df, check_bounds=clr.chromsizes)
        except ValueError as e:
            raise ValueError(
                "View table is incorrect, please, comply with the format. "
            ) from e

    # Balancing weights are used only when balancing is requested:
    weight_name = clr_weight_name if balance else None

    if contact_type == "cis":
        result = expected.get_cis_expected(
            clr,
            view_df=view_df,
            intra_only=True,
            clr_weight_name=weight_name,
            ignore_diags=ignore_diags,
            chunksize=chunksize,
            nproc=nproc
        )
    else:  # "trans", guaranteed by the check at the top
        result = expected.get_trans_expected(
            clr,
            view_df=view_df,
            clr_weight_name=weight_name,
            chunksize=chunksize,
            nproc=nproc,
        )

    # output to file if specified:
    if output:
        result.to_csv(output, sep="\t", index=False, na_rep="nan")
    # or print into stdout otherwise:
    else:
        print(result.to_csv(sep="\t", index=False, na_rep="nan"))
def call_compartments(
    cool_path,
    reference_track,
    view,
    contact_type,
    n_eigs,
    verbose,
    out_prefix,
    bigwig,
):
    """
    Perform eigen value decomposition on a cooler matrix to calculate
    compartment signal by finding the eigenvector that correlates best with the
    phasing track.


    COOL_PATH : the paths to a .cool file with a balanced Hi-C map. Use the
    '::' syntax to specify a group path in a multicooler file.

    TRACK_PATH : the path to a BedGraph-like file that stores phasing track as
    track-name named column.

    BedGraph-like format assumes tab-separated columns chrom, start, stop and
    track-name.

    """
    # Reject unknown contact types up front: otherwise neither decomposition
    # branch would run and the output step would fail on unbound names.
    if contact_type not in ("cis", "trans"):
        raise ValueError(
            f"contact_type must be 'cis' or 'trans', got {contact_type!r}"
        )

    clr = cooler.Cooler(cool_path)

    if reference_track is not None:

        # TODO: This all needs to be refactored into a more generic tabular file parser
        # Needs to handle stdin case too.
        track_path, col = reference_track
        buf, names = sniff_for_header(track_path)

        if names is None:
            # Without a header the column can only be addressed by position:
            if not isinstance(col, int):
                raise click.BadParameter(
                    "No header found. "
                    'Cannot find "{}" column without a header.'.format(col))

            track_name = "ref"
            kwargs = dict(
                header=None,
                usecols=[0, 1, 2, col],
                names=["chrom", "start", "end", track_name],
            )
        else:
            if isinstance(col, int):
                # Translate a positional column into its header name:
                try:
                    col = names[col]
                except IndexError:
                    raise click.BadParameter(
                        'Column #{} not compatible with header "{}".'.format(
                            col, ",".join(names)))
            else:
                if col not in names:
                    raise click.BadParameter(
                        'Column "{}" not found in header "{}"'.format(
                            col, ",".join(names)))

            track_name = col
            kwargs = dict(header="infer",
                          usecols=["chrom", "start", "end", track_name])

        track_df = pd.read_table(buf,
                                 dtype={
                                     "chrom": str,
                                     "start": np.int64,
                                     "end": np.int64,
                                     track_name: np.float64,
                                 },
                                 comment="#",
                                 verbose=verbose,
                                 **kwargs)

        # we need to merge phasing track DataFrame with the cooler bins to get
        # a DataFrame with phasing info aligned and validated against bins inside of
        # the cooler file.
        track = pd.merge(left=clr.bins()[:],
                         right=track_df,
                         how="left",
                         on=["chrom", "start", "end"])

        # sanity check: a left merge that yields more rows than there are bins
        # means some ["chrom", "start", "end"] keys of clr.bins()[:] were
        # matched by more than one row of track_df.
        if len(track) > len(clr.bins()):
            # The error has to be raised; merely constructing it is a no-op.
            raise ValueError(
                "There is something in the {} that ".format(track_path) +
                "couldn't be merged with cooler-bins {}".format(cool_path))
    else:
        # use entire bin-table from cooler, when reference-track is not provided:
        track = clr.bins()[["chrom", "start", "end"]][:]
        track_name = None

    # define view for cis compartment-calling
    # use input "view" BED file or all chromosomes of the cooler:
    if view is None:
        # Generate viewframe from clr.chromsizes:
        view_df = bioframe.make_viewframe(
            [(chrom, 0, clr.chromsizes[chrom]) for chrom in clr.chromnames]
        )
    else:
        # Read view_df, trying 4-column BED first and 3-column BED second:
        try:
            view_df = bioframe.read_table(view, schema="bed4", index_col=False)
        except Exception:
            view_df = bioframe.read_table(view, schema="bed3", index_col=False)
        # Convert view_df to viewframe:
        try:
            view_df = bioframe.make_viewframe(view_df, check_bounds=clr.chromsizes)
        except ValueError as e:
            raise ValueError(
                "View table is incorrect, please, comply with the format. "
            ) from e

    # TODO: Add check that view_df has the same bins as track

    # it's contact_type dependent:
    if contact_type == "cis":
        eigvals, eigvec_table = eigdecomp.cooler_cis_eig(
            clr=clr,
            bins=track,
            view_df=view_df,
            n_eigs=n_eigs,
            phasing_track_col=track_name,
            clip_percentile=99.9,
            sort_metric=None,
        )
    else:  # "trans", guaranteed by the check at the top
        eigvals, eigvec_table = eigdecomp.cooler_trans_eig(
            clr=clr,
            bins=track,
            n_eigs=n_eigs,
            partition=None,
            phasing_track_col=track_name,
            sort_metric=None,
        )

    # Output
    eigvals.to_csv(out_prefix + "." + contact_type + ".lam.txt",
                   sep="\t",
                   index=False)
    eigvec_table.to_csv(out_prefix + "." + contact_type + ".vecs.tsv",
                        sep="\t",
                        index=False)
    if bigwig:
        bioframe.to_bigwig(
            eigvec_table,
            clr.chromsizes,
            out_prefix + "." + contact_type + ".bw",
            value_field="E1",
        )