Code example #1
File: test_util.py  Project: zhuakexi/cooler
def test_bedslice():
    chromsizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bins = util.binnify(chromsizes, 10)
    grouped = bins.groupby('chrom')
    df = util.bedslice(grouped, chromsizes, 'chr1:0-12')
    assert df['chrom'].tolist() == ['chr1', 'chr1']
    assert df['start'].tolist() == [0, 10]
Code example #2
def prepare_snakemake(allc_table_path, output_dir, chrom_sizes_path, template_path, chroms=None, smoothing=True,
                      chunk_size=50000000):
    output_dir = pathlib.Path(output_dir).absolute()
    output_dir.mkdir(exist_ok=True)

    allc_table = pd.read_csv(allc_table_path, sep='\t')
    allc_table.columns = ['allc_path', 'sample', 'group']

    if allc_table['group'].unique().size != 2:
        raise ValueError(
            f"There must be two and only two different groups, got {allc_table['group'].unique().size}."
        )
    group1, group2 = allc_table['group'].unique()
    group1_allc = allc_table.loc[allc_table['group'] == group1,
                                 'allc_path'].tolist()
    group2_allc = allc_table.loc[allc_table['group'] == group2,
                                 'allc_path'].tolist()
    group1_id = allc_table.loc[allc_table['group'] == group1, 'sample'].tolist()
    group2_id = allc_table.loc[allc_table['group'] == group2, 'sample'].tolist()

    chrom_sizes = read_chromsizes(chrom_sizes_path).reindex(chroms)
    if chroms is None:
        chroms = chrom_sizes.index.tolist()
    bins = binnify(chrom_sizes.loc[chroms], binsize=chunk_size)
    regions = []
    for _, (chrom, start, end) in bins.iterrows():
        region = f'{chrom}:{start}-{end}'
        regions.append(region)

    for region in regions:
        config_path = f'{output_dir}/{region}.yaml'
        parameters = dict(region=region,
                          allc_paths=group1_allc + group2_allc,
                          group1=group1_id,
                          group2=group2_id,
                          smoothing=smoothing)
        with open(config_path, 'w') as f:
            f.write(yaml.dump(parameters))

    snakefile = f"""
regions = {regions}
rule main:
    input:
        expand('{{region}}.DSS.DML.hdf', region=regions)

rule papermill:
    input:
        nb='{template_path}',
        config='{{region}}.yaml'
    output:
        nb='{{region}}.ipynb',
        data='{{region}}.DSS.DML.hdf'
    shell:
        'papermill {{input.nb}} {{output.nb}} -f {{input.config}} && sleep 10'
"""
    snakefile_path = f'{output_dir}/Snakefile'
    with open(snakefile_path, 'w') as f:
        f.write(snakefile)
    return snakefile_path
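
A minimal usage sketch for the helper above (hypothetical file names, not taken from the original project): the ALLC table is read as tab-separated text with the three columns allc_path, sample and group, the group column must contain exactly two distinct values, and the returned path points at the generated Snakefile.

# Hypothetical inputs -- adjust paths to your own data layout.
snakefile_path = prepare_snakemake(
    allc_table_path='allc_table.tsv',     # columns: allc_path, sample, group
    output_dir='dss_run',                 # per-region YAML configs and the Snakefile go here
    chrom_sizes_path='mm10.chrom.sizes',
    template_path='dss_template.ipynb',   # papermill template executed once per region
    chroms=None,                          # None keeps every chromosome in the sizes file
    chunk_size=50_000_000)                # one region (one notebook run) per 50 Mb chunk
# The workflow itself would then be launched with something like:
#   snakemake -d dss_run -j 8
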
Code example #3
File: test_util.py  Project: zhuakexi/cooler
def test_genome_segmentation():
    chromsizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bins = util.binnify(chromsizes, 10)
    gs = util.GenomeSegmentation(chromsizes, bins)
    df = gs.fetch('chr1')
    assert len(df) == 4
    df = gs.fetch('chr1:2-30')
    assert len(df) == 3
    util.balanced_partition(gs, 2, ['chr1'])
Code example #4
File: test_util.py  Project: zhuakexi/cooler
def test_get_binsize():
    chromsizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bins = util.binnify(chromsizes, 10)
    assert util.get_binsize(bins) == 10

    # variable-sized bins
    bins = pd.read_csv(op.join(datadir, 'toy.bins.var.bed'),
                       names=['chrom', 'start', 'end'],
                       sep='\t')
    assert util.get_binsize(bins) is None

    # ambiguous case: one bin per chromosome with different lengths
    bins = pd.DataFrame({
        'chrom': ['chr1', 'chr2', 'chr3'],
        'start': [0, 0, 0],
        'end': [100, 200, 300]
    })
    assert util.get_binsize(bins) is None
Code example #5
    def __init__(self,
                 datasets,
                 outfil,
                 assembly='hg38',
                 chromsizes_file=None,
                 chroms=['#', 'X'],
                 onlyIntra=True):

        self.outfil = os.path.abspath(os.path.expanduser(outfil))
        if os.path.exists(self.outfil):
            log.error('Cooler file {} already exists, exit ...'.format(
                self.outfil))
            sys.exit(1)
        self.chroms = set(chroms)
        self.onlyIntra = onlyIntra
        data = datasets

        ## Ready for data loading
        if not chromsizes_file is None:
            chromsizes_path = os.path.abspath(
                os.path.expanduser(chromsizes_file))
            log.info('Read chromosome sizes from {}'.format(chromsizes_path))
            chromsizes = readChromSizes(chromsizes_path, self.chroms)
        else:
            log.info('Fetch chromosome sizes from UCSC ...')
            chromsizes = fetchChromSizes(assembly, self.chroms)
        chromlist = chromsizes.keys()
        # sort chromosome labels
        tmp = list(
            map(str, sorted(map(int, [i for i in chromlist if i.isdigit()]))))
        nondigits = [i for i in chromlist if not i.isdigit()]
        for i in ['X', 'Y', 'M']:
            if i in nondigits:
                tmp.append(nondigits.pop(nondigits.index(i)))
        chromlist = tmp + sorted(nondigits)
        lengths = [chromsizes[i] for i in chromlist]
        self.chromsizes = pd.Series(data=lengths, index=chromlist)
        log.info('Done')

        ## We don't read data into memory at this point.
        ## Until more robust handling is added, assume chromosome labels never contain the '_' character.
        self.Map = {}
        for res in data:
            if data[res].endswith('.npz'):
                self.Map[res] = {}
                lib = np.load(data[res])
                for i in lib.files:
                    if (not '_' in i) and ((not self.chroms) or
                                           (i.isdigit() and '#' in self.chroms)
                                           or (i in self.chroms)):
                        # Compatible with TADLib and old version of runHiC
                        c1 = c2 = i
                        self.Map[res][(c1, c2)] = lib
                    else:
                        tmp = i.split('_')
                        if len(tmp) != 2:
                            continue
                        c1, c2 = tmp
                        check1 = ((not self.chroms)
                                  or (c1.isdigit() and '#' in self.chroms)
                                  or (c1 in self.chroms))
                        check2 = ((not self.chroms)
                                  or (c2.isdigit() and '#' in self.chroms)
                                  or (c2 in self.chroms))
                        if check1 and check2:
                            self.Map[res][(c1, c2)] = lib
            else:
                self.Map[res] = self._scanFolder(data[res])

        self._intertype = np.dtype({
            'names': ['bin1', 'bin2', 'IF'],
            'formats': [np.int_, np.int_, np.float64]
        })

        log.info(
            'Extract and save data into cooler format for each resolution ...')
        for res in self.Map:
            log.info('Current resolution: {}bp'.format(res))
            byres = self.Map[res]
            # Extract parts of chromsizes
            subset = []
            for c1, c2 in byres:
                subset.extend([c1, c2])
            subset = set(subset)
            Bool = [(i in subset) for i in self.chromsizes.index]
            chromsizes = self.chromsizes[Bool]
            bin_cumnums = self.binCount(chromsizes, res)
            log.info('Generate bin table ...')
            bintable = binnify(chromsizes, res)
            pixels = self._generator(byres, chromsizes, bin_cumnums)
            if os.path.exists(self.outfil):
                append = True
            else:
                append = False
            cooler_uri = '{}::{}'.format(self.outfil, res)
            if self.onlyIntra:
                create(cooler_uri,
                       bintable,
                       pixels,
                       assembly=assembly,
                       append=append,
                       boundscheck=False,
                       triucheck=False,
                       dupcheck=False,
                       ensure_sorted=False,
                       metadata={'onlyIntra': str(self.onlyIntra)})
            else:
                create_from_unordered(
                    cooler_uri,
                    bintable,
                    pixels,
                    assembly=assembly,
                    append=append,
                    metadata={'onlyIntra': str(self.onlyIntra)},
                    delete_temp=True,
                    boundscheck=False,
                    triucheck=False,
                    dupcheck=False,
                    ensure_sorted=False)
Code example #6
File: utilities.py  Project: YunXiaLiu/HiCPeaks
def toCooler(outfil,
             data_path,
             res,
             assembly,
             chroms=['#', 'X'],
             symmetric=True,
             count_type=float,
             cache_dir=None,
             delete_cache=True):
    """
    Create a Cooler from TXT Hi-C data.
    
    Parameters
    ----------
    outfil : str
        Path of the output Cooler file.
        
    data_path : str
        Path of original contact matrix file (in TXT format).
        
    res : int
        Resolution / Bin size of the matrix in base pairs.
    
    chroms : list
        List of chromosome labels. Only Hi-C data within the specified chromosomes
        will be included. In particular, '#' stands for chromosomes with numerical
        labels. If an empty list is provided, all chromosome data will be loaded.
        (Default: ['#', 'X'])
    
    cache_dir : str or None
        All intermediate or temporary files will be generated under this folder.
        If None, the folder returned by :py:func:`tempfile.gettempdir` will be
        used. (Default: None)
    
    delete_cache : Bool
        Whether to delete temporary files when finished. (Default: True)
    
    
    """
    outfil = os.path.abspath(os.path.expanduser(outfil))
    if os.path.exists(outfil):
        log.error('Cooler file {} already exists, exit ...'.format(outfil))
        sys.exit(1)

    ## ready for data loading
    tl = time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))
    kw = {'prefix': 'pixels', 'suffix': tl, 'dir': cache_dir}
    pixel_fil = tempfile.mktemp(**kw)

    # write the pixel file
    chromsizes = {}
    with open(pixel_fil, 'w') as out:
        with open(data_path, 'r') as source:
            for line in source:
                c1, p1, c2, p2, count = line.rstrip().split()
                c1 = c1.lstrip('chr')
                c2 = c2.lstrip('chr')
                check1 = ((not chroms) or (c1.isdigit() and ('#' in chroms))
                          or (c1 in chroms))
                check2 = ((not chroms) or (c2.isdigit() and ('#' in chroms))
                          or (c2 in chroms))
                if (not check1) or (not check2):
                    continue
                ip_1, ip_2 = int(p1) + res, int(p2) + res
                if c1 in chromsizes:
                    if ip_1 > chromsizes[c1]:
                        chromsizes[c1] = ip_1
                else:
                    chromsizes[c1] = ip_1
                if c2 in chromsizes:
                    if ip_2 > chromsizes[c2]:
                        chromsizes[c2] = ip_2
                else:
                    chromsizes[c2] = ip_2
                newline = [c1, p1, str(ip_1), c2, p2, str(ip_2), count]
                out.write('\t'.join(newline) + '\n')

    # generate bin table
    chromlist = chromsizes.keys()
    # sort chromosome labels
    tmp = list(map(str, sorted(map(int, [i for i in chromlist if i.isdigit()]))))
    nondigits = [i for i in chromlist if not i.isdigit()]
    for i in ['X', 'Y', 'M']:
        if i in nondigits:
            tmp.append(nondigits.pop(nondigits.index(i)))
    chromlist = tmp + sorted(nondigits)
    lengths = [chromsizes[i] for i in chromlist]
    chromsizes = pd.Series(data=lengths, index=chromlist)
    bins = binnify(chromsizes, res)

    # output fields
    output_field_names = ['bin1_id', 'bin2_id', 'count']
    output_field_dtypes = {'bin1_id': int, 'bin2_id': int, 'count': count_type}

    # input fields
    input_field_names = [
        'chrom1', 'start1', 'end1', 'chrom2', 'start2', 'end2', 'count'
    ]
    input_field_dtypes = {
        'chrom1': str,
        'start1': int,
        'end1': int,
        'chrom2': str,
        'start2': int,
        'end2': int,
        'count': count_type,
    }
    input_field_numbers = {
        'chrom1': 0,
        'start1': 1,
        'end1': 2,
        'chrom2': 3,
        'start2': 4,
        'end2': 5,
        'count': 6,
    }

    tril_action = 'drop' if symmetric else 'reflect'
    pipeline = sanitize_records(bins,
                                schema='bg2',
                                is_one_based=False,
                                tril_action=tril_action,
                                sort=True)

    reader = pd.read_table(
        pixel_fil,
        usecols=[input_field_numbers[name] for name in input_field_names],
        names=input_field_names,
        dtype=input_field_dtypes,
        iterator=True,
        chunksize=int(40e6))

    create_from_unordered(outfil,
                          bins,
                          map(pipeline, reader),
                          columns=output_field_names,
                          dtypes=output_field_dtypes,
                          assembly=assembly,
                          mergebuf=int(40e6),
                          ensure_sorted=False)
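
As the docstring above explains, '#' in chroms selects chromosomes with purely numerical labels and an empty list disables filtering altogether. A minimal sketch of that check in isolation (keep_chrom is a hypothetical helper written here for illustration, not part of HiCPeaks):

def keep_chrom(label, chroms=('#', 'X')):
    # Strip a leading 'chr' prefix, then keep the label if no filter is given,
    # if it is numeric and '#' was requested, or if it is listed explicitly.
    label = label[3:] if label.startswith('chr') else label
    return (not chroms) or (label.isdigit() and '#' in chroms) or (label in chroms)

# keep_chrom('chr7') -> True, keep_chrom('chrX') -> True, keep_chrom('chrY') -> False
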
Code example #7
File: utilities.py  Project: XiaoTaoWang/HiCPeaks
    def __init__(self, datasets, outfil, assembly='hg38', chromsizes_file=None, chroms=['#','X'], onlyIntra=True,
        dtype='int'):

        self.outfil = os.path.abspath(os.path.expanduser(outfil))
        if os.path.exists(self.outfil):
            log.error('Cooler file {} already exists, exit ...'.format(self.outfil))
            sys.exit(1)
        self.chroms = set(chroms)
        self.onlyIntra = onlyIntra
        data = datasets

        ## Ready for data loading
        if not chromsizes_file is None:
            chromsizes_path = os.path.abspath(os.path.expanduser(chromsizes_file))
            log.info('Read chromosome sizes from {}'.format(chromsizes_path))
            chromsizes = readChromSizes(chromsizes_path, self.chroms)
        else:
            log.info('Fetch chromosome sizes from UCSC ...')
            chromsizes = fetchChromSizes(assembly, self.chroms)
        chromlist = chromsizes.keys()
        # sort chromosome labels
        tmp = list(map(str, sorted(map(int, [i for i in chromlist if i.isdigit()]))))
        nondigits = [i for i in chromlist if not i.isdigit()]
        for i in ['X','Y','M']:
            if i in nondigits:
                tmp.append(nondigits.pop(nondigits.index(i)))
        chromlist = tmp + sorted(nondigits)
        lengths = [chromsizes[i] for i in chromlist]
        self.chromsizes = pd.Series(data=lengths, index=chromlist)
        log.info('Done')

        ## We don't read data into memory at this point.
        ## Until more robust handling is added, assume chromosome labels never contain the '_' character.
        self.Map = {}
        for res in data:
            if data[res].endswith('.npz'):
                self.Map[res] = {}
                lib = np.load(data[res])
                for i in lib.files:
                    if (not '_' in i) and ((not self.chroms) or (i.isdigit() and '#' in self.chroms) or (i in self.chroms)):
                        # Compatible with TADLib and old version of runHiC
                        c1 = c2 = i
                        self.Map[res][(c1,c2)] = lib
                    else:
                        tmp = i.split('_')
                        if len(tmp)!=2:
                            continue
                        c1, c2 = tmp
                        check1 = ((not self.chroms) or (c1.isdigit() and '#' in self.chroms) or (c1 in self.chroms))
                        check2 = ((not self.chroms) or (c2.isdigit() and '#' in self.chroms) or (c2 in self.chroms))
                        if check1 and check2:
                            self.Map[res][(c1,c2)] = lib
            else:
                self.Map[res] = self._scanFolder(data[res])

        self._intertype = np.dtype({'names': ['bin1', 'bin2', 'IF'],
                                    'formats': [np.int_, np.int_, np.float64]})
        
        log.info('Extract and save data into cooler format for each resolution ...')
        for res in self.Map:
            log.info('Current resolution: {}bp'.format(res))
            byres = self.Map[res]
            # Extract parts of chromsizes
            subset = []
            for c1, c2 in byres:
                subset.extend([c1,c2])
            subset = set(subset)
            Bool = [(i in subset) for i in self.chromsizes.index]
            chromsizes = self.chromsizes[Bool]
            bin_cumnums = self.binCount(chromsizes, res)
            log.info('Generate bin table ...')
            bintable = binnify(chromsizes, res)
            pixels = self._generator(byres, chromsizes, bin_cumnums)
            if os.path.exists(self.outfil):
                mode = 'a'
            else:
                mode = 'w'
            if dtype == 'int':
                dtypes = {'count': np.int32}
            else:
                dtypes = {'count': np.float64}
            cooler_uri = '{}::{}'.format(self.outfil, res)
            if self.onlyIntra:
                create_cooler(cooler_uri, bintable, pixels, assembly=assembly, mode=mode,
                       boundscheck=False, triucheck=False, dupcheck=False, ensure_sorted=False,
                       ordered=True, metadata={'onlyIntra':str(self.onlyIntra)}, dtypes=dtypes)
            else:
                create_from_unordered(cooler_uri, bintable, pixels, assembly=assembly,
                                      mode=mode, metadata={'onlyIntra':str(self.onlyIntra)},
                                      delete_temp=True, boundscheck=False, triucheck=False,
                                      dupcheck=False, ensure_sorted=False, dtypes=dtypes)
Code example #8
File: contacts.py  Project: nanoporetech/pore-c
def export_to_cooler(
    contact_table,
    output_prefix,
    cooler_resolution,
    fragment_table,
    chromsizes,
    query,
    query_columns=None,
    by_haplotype=False,
):

    results = []
    if query_columns:
        columns = query_columns[:]
    else:
        columns = []
    columns.extend(["align1_fragment_id", "align2_fragment_id"])
    if by_haplotype:
        columns.extend(["align1_haplotype", "align2_haplotype"])
    contact_df = dd.read_parquet(contact_table,
                                 engine=PQ_ENGINE,
                                 version=PQ_VERSION,
                                 columns=columns,
                                 index=False)
    if query:
        contact_df = contact_df.query(query)

    chrom_dict = pd.read_csv(chromsizes,
                             sep="\t",
                             header=None,
                             names=["chrom", "size"],
                             index_col=["chrom"],
                             squeeze=True)
    # create even-width bins using cooler
    bins_df = binnify(chrom_dict, cooler_resolution)
    bins_df.index.name = "bin_id"
    # convert to ranges for overlap
    bins = pr.PyRanges(bins_df.reset_index().rename(columns={
        "start": "Start",
        "end": "End",
        "chrom": "Chromosome"
    }))

    fragment_df = dd.read_parquet(fragment_table,
                                  engine=PQ_ENGINE,
                                  version=PQ_VERSION).compute()
    midpoint_df = pr.PyRanges(
        fragment_df.reset_index()[[
            "chrom", "start", "end", "fragment_id"
        ]].assign(start=lambda x: ((x.start + x.end) * 0.5).round(0).astype(
            int)).eval("end = start + 1").rename(columns={
                "chrom": "Chromosome",
                "start": "Start",
                "end": "End"
            }))
    # use a PyRanges join to assign fragments to bins
    fragment_to_bin = midpoint_df.join(
        bins, how="left").df[["fragment_id", "bin_id"]]
    fragment_to_bin = fragment_to_bin.set_index(
        "fragment_id").sort_index()  # .astype(np.uint32)
    nulls = fragment_to_bin["bin_id"] == -1
    if nulls.any():
        logger.warning(
            "Some fragments did not overlap bins, removing from analysis:\n{}".
            format(fragment_to_bin[nulls].join(fragment_df)))
        fragment_to_bin = fragment_to_bin[~nulls]

    # use a join to assign each end of a contact to a bin
    binned_contacts = (contact_df.merge(
        fragment_to_bin,
        how="inner",
        right_index=True,
        left_on="align1_fragment_id").merge(
            fragment_to_bin,
            how="inner",
            right_index=True,
            left_on="align2_fragment_id",
            suffixes=[None, "_2"]).rename(columns={
                "bin_id": "bin1_id",
                "bin_id_2": "bin2_id"
            }))

    if not by_haplotype:
        cooler_path = output_prefix + ".cool"
        # group size == number of contacts per bin_pair
        pixels = binned_contacts.groupby(
            ["bin1_id",
             "bin2_id"]).size().rename("count").astype(np.int32).reset_index()
        create_cooler(cooler_path,
                      bins_df,
                      pixels,
                      ordered=True,
                      symmetric_upper=True,
                      ensure_sorted=True)
        c = Cooler(cooler_path)
        logger.info(f"Created cooler: {c.info}")
        results.append(cooler_path)
    else:
        tmp_parquet = output_prefix + ".tmp.pq"
        pixels = (
            # create a key to group by haplotype pair; order of haplotypes doesn't matter
            binned_contacts.assign(
                hap_key=lambda x: x[["align1_haplotype", "align2_haplotype"]
                                    ].apply(lambda y: "{}_{}".format(*sorted(
                                        y)).replace("-1", "nohap"),
                                            axis=1,
                                            meta="object")
            ).groupby(["hap_key", "bin1_id",
                       "bin2_id"]).size().rename("count").astype(
                           np.int32
                       ).reset_index().astype({"hap_key": "category"}))

        # save to a temporary parquet file, this might not be necessary
        # but want to avoid the whole contact matrix hitting memory
        pixels.to_parquet(
            tmp_parquet,
            write_metadata_file=True,
            partition_on=["hap_key"],
            write_index=False,
            engine=PQ_ENGINE,
            version=PQ_VERSION,
        )

        pixels = dd.read_parquet(tmp_parquet,
                                 engine=PQ_ENGINE,
                                 version=PQ_VERSION,
                                 columns=["hap_key"],
                                 index=False)
        hap_keys = pixels["hap_key"].unique().compute()
        # create a cooler for each haplotype pair
        for hap_key in hap_keys:
            cooler_path = f"{output_prefix}.{hap_key}.cool"
            pixels = dd.read_parquet(
                tmp_parquet,
                filters=[("hap_key", "==", hap_key)],
                index=False,
                engine=PQ_ENGINE,
                version=PQ_VERSION,
                columns=["bin1_id", "bin2_id", "count"],
            )
            create_cooler(cooler_path,
                          bins_df,
                          pixels,
                          ordered=True,
                          symmetric_upper=True,
                          ensure_sorted=True)
            c = Cooler(cooler_path)
            logger.info(f"Created cooler: {c.info}")
            results.append(cooler_path)

        shutil.rmtree(tmp_parquet)

    return results
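
The PyRanges join above assigns each fragment to the fixed-width bin containing its midpoint; within a single chromosome this is the same result as integer division of the midpoint by the resolution. A simplified sketch of that arithmetic (the real code works with genome-wide bin ids taken from bins_df, which this ignores):

def midpoint_bin(start, end, resolution):
    # Per-chromosome bin index of a fragment's midpoint.
    midpoint = int(round((start + end) * 0.5))
    return midpoint // resolution

# A fragment spanning 14_900-15_300 at 1 kb resolution lands in bin 15
# (midpoint 15_100, and 15_100 // 1_000 == 15).
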
Code example #9
def prepare_snakemake(allc_table_path,
                      output_dir,
                      chrom_sizes_path,
                      template_path,
                      chroms=None,
                      test_covariate='group',
                      match_covariate=None,
                      adjust_covariate=None,
                      cutoff=0.1,
                      min_num_region=3,
                      smooth=True,
                      bp_span=1000,
                      min_in_span=30,
                      max_gap_smooth=2500,
                      max_gap=1000,
                      verbose=True,
                      max_perms=10,
                      stat="stat",
                      block=False,
                      block_size=5000,
                      chrs_per_chunk=1,
                      cpu=40,
                      chunk_size=5000000000):
    output_dir = pathlib.Path(output_dir).absolute()
    output_dir.mkdir(exist_ok=True)

    chrom_sizes = read_chromsizes(chrom_sizes_path).reindex(chroms)
    if chroms is None:
        chroms = chrom_sizes.index.tolist()
    bins = binnify(chrom_sizes.loc[chroms], binsize=chunk_size)
    regions = []
    for _, (chrom, start, end) in bins.iterrows():
        region = f'{chrom}:{start}-{end}'
        regions.append(region)

    for region in regions:
        config_path = f'{output_dir}/{region}.yaml'
        parameters = {
            'region': region,
            'allc_table_path': allc_table_path,
            'test_covariate': test_covariate,
            'match_covariate': match_covariate,
            'adjust_covariate': adjust_covariate,
            'cutoff': cutoff,
            'min_num_region': min_num_region,
            'smooth': smooth,
            'bp_span': bp_span,
            'min_in_span': min_in_span,
            'max_gap_smooth': max_gap_smooth,
            'max_gap': max_gap,
            'verbose': verbose,
            'max_perms': max_perms,
            'stat': stat,
            'block': block,
            'block_size': block_size,
            'chrs_per_chunk': chrs_per_chunk,
            'cpu': cpu
        }
        with open(config_path, 'w') as f:
            f.write(yaml.dump(parameters))

    snakefile = f"""
regions = {regions}
rule main:
    input:
        expand('{{region}}.DMR.hdf', region=regions)

rule papermill:
    input:
        nb='{template_path}',
        config='{{region}}.yaml'
    output:
        nb='{{region}}.ipynb',
        data='{{region}}.DMR.hdf'
    threads:
        1 #{cpu}
    shell:
        'papermill {{input.nb}} {{output.nb}} -f {{input.config}} && sleep 10'
"""
    snakefile_path = f'{output_dir}/Snakefile'
    with open(snakefile_path, 'w') as f:
        f.write(snakefile)
    return regions
Code example #10
                lock.acquire()
                print("right before collapse ...{}  {}".format(
                    i, spans[i:i + batchsize]))
                results = self._map(self.aggregate, spans[i:i + batchsize])
            finally:
                lock.release()
            for df in results:
                # yield {k: v.values for k, v in six.iteritems(df)}
                yield df


input_uri = ""

c = Cooler(input_uri)

new_bins = binnify(c.chromsizes, 2 * c.binsize)

iterator = CoolerAggregator(input_uri, new_bins, 1000000, batchsize=1, map=map)

# # last message before it fails ...
# # INFO:cooler:17868809 17872380
# for ii in iterator:
#     print(ii)

# from cooler.api import Cooler
lo, hi = 17869999, 17872300
# lo, hi = 17868809, 17872380

clr = Cooler(input_uri)
# convert_enum=False returns chroms as raw ints
table = clr.pixels(join=True, convert_enum=False)
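
The snippet above drives cooler's internal CoolerAggregator directly to rebin a matrix at twice the bin size; in cooler 0.8 and later the same operation is exposed through the public coarsen_cooler function. A minimal sketch, assuming a recent cooler install and an existing file at the hypothetical path matrix.cool:

import cooler

# Double the bin size, streaming roughly one million pixels per chunk.
cooler.coarsen_cooler('matrix.cool', 'matrix.2x.cool', factor=2, chunksize=1_000_000)
print(cooler.Cooler('matrix.2x.cool').binsize)
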
Code example #11
File: test_util.py  Project: zhuakexi/cooler
def test_check_bins():
    chromsizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bins = util.binnify(chromsizes, 10)
    bins['chrom'] = bins['chrom'].astype(str)
    bins = util.check_bins(bins, chromsizes)
    assert pd.api.types.is_categorical_dtype(bins["chrom"])
Code example #12
File: test_util.py  Project: zhuakexi/cooler
def test_get_chromsizes():
    chromsizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bins = util.binnify(chromsizes, 10)
    assert np.allclose(util.get_chromsizes(bins), chromsizes)
Code example #13
File: test_util.py  Project: zhuakexi/cooler
def test_binnify():
    chromsizes = util.read_chromsizes(op.join(datadir, 'toy.chrom.sizes'))
    bins = util.binnify(chromsizes, 10)
    assert len(bins) == 8
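
For reference, binnify tiles each chromosome with fixed-width bins and truncates the last bin at the chromosome end, so the assertion above simply says that toy.chrom.sizes yields 8 bins at a 10 bp bin size. A small self-contained sketch with made-up chromosome sizes (not the actual contents of toy.chrom.sizes):

import pandas as pd
from cooler.util import binnify

chromsizes = pd.Series({'chr1': 32, 'chr2': 48})   # hypothetical sizes, for illustration only
bins = binnify(chromsizes, 10)
print(bins)
# chr1 contributes bins 0-10, 10-20, 20-30, 30-32 (last bin truncated);
# chr2 contributes 0-10, ..., 40-48 -- nine rows in total.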