def _size(self, block):
    """Count the records stored for one chromosome-pair block.

    Parameters
    ----------
    block : pair of str
        ``(chrom1, chrom2)`` names of the block to measure.

    Returns
    -------
    int
        Number of records yielded by a 2D query spanning both
        chromosomes in full.
    """
    import pypairix

    reader = pypairix.open(self.filepath, 'r')
    first, second = block
    # Query from 0 to the full chromosome length on both axes; the
    # trailing 1 is forwarded to query2D as its optional last argument.
    hits = reader.query2D(
        first, 0, self.gs.chromsizes[first],
        second, 0, self.gs.chromsizes[second],
        1)
    n_records = 0
    for _ in hits:
        n_records += 1
    return n_records
def __init__(self, filepath, chromsizes, bins, map=map, **kwargs):
    """Set up a reader over a pairix-indexed contact list.

    Parameters
    ----------
    filepath : str
        Path handed to ``pypairix.open``.
    chromsizes, bins
        Forwarded to ``GenomeSegmentation``.
    map : callable, optional
        Map implementation stored on the instance as ``_map``.
    **kwargs
        Accepted and ignored.

    Raises
    ------
    ImportError
        If ``pypairix`` cannot be imported.
    """
    try:
        import pypairix
    except ImportError:
        raise ImportError(
            "pypairix is required to read pairix-indexed files")

    self._map = map

    reader = pypairix.open(filepath, 'r')
    # column numbers as recorded in the index
    self.C1 = reader.get_chr1_col()
    self.C2 = reader.get_chr2_col()
    self.P1 = reader.get_startpos1_col()
    self.P2 = reader.get_startpos2_col()

    # contigs available in the contact list: both halves of every
    # 'chrA|chrB' block name
    contig_names = set()
    for blockname in reader.get_blocknames():
        contig_names.update(blockname.split('|'))
    self.file_contigs = contig_names

    # all requested contigs will be placed in the output matrix
    self.gs = GenomeSegmentation(chromsizes, bins)

    self.filepath = filepath
    self.n_records = None

    # warn about requested contigs not seen in the contact list
    for chrom in self.gs.contigs:
        if chrom in self.file_contigs:
            continue
        warnings.warn("Did not find contig " +
                      " '{}' in contact list file.".format(chrom))
class PairixTest2D_4DN(unittest.TestCase):
    """2D range queries on the 4DN-format pairs file (chr21 x chr22)."""

    f_type = find_pairs_type(TEST_FILE_2D_4DN)
    regions = read_pairs(TEST_FILE_2D_4DN, f_type)
    chrom = 'chr21'
    start = 1
    end = 48129895
    chrom2 = 'chr22'
    start2 = 1
    end2 = 51304566
    # expected records, selected directly from the parsed file
    result = get_result_2D(regions, chrom, start, end, chrom2, start2, end2)
    pr = pypairix.open(TEST_FILE_2D_4DN)

    def test_query2_4dn(self):
        """query2D with explicit coordinates returns the expected records."""
        hits = self.pr.query2D(self.chrom, self.start, self.end,
                               self.chrom2, self.start2, self.end2)
        observed = build_it_result(hits, self.f_type)
        self.assertEqual(self.result, observed)

    def test_querys_2_4dn(self):
        """querys2D with a region string returns the expected records."""
        coords = (self.chrom, self.start, self.end,
                  self.chrom2, self.start2, self.end2)
        hits = self.pr.querys2D('{}:{}-{}|{}:{}-{}'.format(*coords))
        observed = build_it_result(hits, self.f_type)
        self.assertEqual(self.result, observed)
class PairixTest2D(unittest.TestCase):
    """2D range queries on the tab-delimited test file (10 x 20)."""

    f_type = find_pairs_type(TEST_FILE_2D)
    regions = read_pairs(TEST_FILE_2D, f_type)
    chrom = '10'
    start = 1
    end = 1000000
    chrom2 = '20'
    start2 = 50000000
    end2 = 60000000
    # expected records, selected directly from the parsed file
    result = get_result_2D(regions, chrom, start, end, chrom2, start2, end2)
    pr = pypairix.open(TEST_FILE_2D)

    def test_query2(self):
        """query2D with explicit coordinates returns the expected records."""
        hits = self.pr.query2D(self.chrom, self.start, self.end,
                               self.chrom2, self.start2, self.end2)
        observed = build_it_result(hits, self.f_type)
        self.assertEqual(self.result, observed)

    def test_querys_2(self):
        """querys2D with a region string returns the expected records."""
        coords = (self.chrom, self.start, self.end,
                  self.chrom2, self.start2, self.end2)
        hits = self.pr.querys2D('{}:{}-{}|{}:{}-{}'.format(*coords))
        observed = build_it_result(hits, self.f_type)
        self.assertEqual(self.result, observed)

    def test_querys_2_bad_order(self):
        """A first region with end before start emits one PairixWarning."""
        # start and end of the first region are deliberately swapped
        swapped = (self.chrom, self.end, self.start,
                   self.chrom2, self.start2, self.end2)
        query = '{}:{}-{}|{}:{}-{}'.format(*swapped)
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            # trigger a warning
            self.pr.querys2D(query)
            # exactly one warning, of the pairix category
            self.assertEqual(len(caught), 1)
            last = caught[-1]
            self.assertTrue(
                issubclass(last.category, pypairix.PairixWarning))
def test_build_index_with_force(self):
    """Re-index with ``force=1`` and check the query result is unchanged.

    No preset is given, so indexing relies on the ``pairs.gz`` file
    extension being recognised.
    """
    pypairix.build_index(TEST_FILE_LARGE_CHR, force=1)
    reader = pypairix.open(TEST_FILE_LARGE_CHR)
    coords = (self.chrom, self.start, self.end,
              self.chrom2, self.start2, self.end2)
    hits = reader.querys2D('{}:{}-{}|{}:{}-{}'.format(*coords))
    observed = build_it_result(hits, self.f_type)
    self.assertEqual(self.result, observed)
class PairixTest(unittest.TestCase):
    """1D range queries on the VCF test file."""

    regions = read_vcf(TEST_FILE_1D)
    chrom = 'chr10'
    start = 25944
    end = 27000000
    # expected records, selected directly from the parsed file
    result = get_result(regions, chrom, start, end)
    pr = pypairix.open(TEST_FILE_1D)

    @staticmethod
    def _as_triples(records):
        """Reduce each record to ``[field0, field1, field1]``."""
        triples = []
        for rec in records:
            triples.append([rec[0], rec[1], rec[1]])
        return triples

    def test_query(self):
        """query with explicit coordinates returns the expected records."""
        hits = self.pr.query(self.chrom, self.start, self.end)
        observed = self._as_triples(hits)
        self.assertEqual(self.result, observed)

    def test_querys(self):
        """querys with a region string returns the expected records."""
        region = '{}:{}-{}'.format(self.chrom, self.start, self.end)
        hits = self.pr.querys(region)
        observed = self._as_triples(hits)
        self.assertEqual(self.result, observed)

    def test_build_index_with_force_vcf(self):
        """Re-index with ``force=1``; relies on the ``vcf.gz`` extension."""
        pypairix.build_index(TEST_FILE_1D, force=1)
        reader = pypairix.open(TEST_FILE_1D)
        region = '{}:{}-{}'.format(self.chrom, self.start, self.end)
        hits = reader.querys(region)
        observed = self._as_triples(hits)
        self.assertEqual(self.result, observed)
def test_build_index_with_force_merged_nodups(self):
    """Re-index with the ``merged_nodups`` preset and re-run the query."""
    pypairix.build_index(TEST_FILE_2D_SPACE, "merged_nodups", force=1)
    reader = pypairix.open(TEST_FILE_2D_SPACE)
    coords = (self.chrom, self.start, self.end,
              self.chrom2, self.start2, self.end2)
    hits = reader.querys2D('{}:{}-{}|{}:{}-{}'.format(*coords))
    observed = build_it_result(hits, self.f_type)
    self.assertEqual(self.result, observed)
class PairixTest2DSpace(unittest.TestCase):
    """2D range queries on the space-delimited test file (10 x 20)."""

    f_type = find_pairs_type(TEST_FILE_2D_SPACE, ' ')
    regions = read_pairs(TEST_FILE_2D_SPACE, f_type, ' ')
    chrom = '10'
    start = 1
    end = 1000000
    chrom2 = '20'
    start2 = 50000000
    end2 = 60000000
    # expected records, selected directly from the parsed file
    result = get_result_2D(regions, chrom, start, end, chrom2, start2, end2)
    pr = pypairix.open(TEST_FILE_2D_SPACE)

    def test_query2(self):
        """query2D with explicit coordinates returns the expected records."""
        hits = self.pr.query2D(self.chrom, self.start, self.end,
                               self.chrom2, self.start2, self.end2)
        observed = build_it_result(hits, self.f_type)
        self.assertEqual(self.result, observed)

    def test_querys_2(self):
        """querys2D with a region string returns the expected records."""
        coords = (self.chrom, self.start, self.end,
                  self.chrom2, self.start2, self.end2)
        hits = self.pr.querys2D('{}:{}-{}|{}:{}-{}'.format(*coords))
        observed = build_it_result(hits, self.f_type)
        self.assertEqual(self.result, observed)

    def test_build_index_with_force_merged_nodups(self):
        """Re-index with the ``merged_nodups`` preset and re-run the query."""
        pypairix.build_index(TEST_FILE_2D_SPACE, "merged_nodups", force=1)
        reader = pypairix.open(TEST_FILE_2D_SPACE)
        coords = (self.chrom, self.start, self.end,
                  self.chrom2, self.start2, self.end2)
        hits = reader.querys2D('{}:{}-{}|{}:{}-{}'.format(*coords))
        observed = build_it_result(hits, self.f_type)
        self.assertEqual(self.result, observed)
def test_build_index_with_force_vcf(self):
    """Re-index the VCF file with ``force=1`` and re-run the 1D query.

    No preset is given, so indexing relies on the ``vcf.gz`` file
    extension being recognised.
    """
    pypairix.build_index(TEST_FILE_1D, force=1)
    reader = pypairix.open(TEST_FILE_1D)
    region = '{}:{}-{}'.format(self.chrom, self.start, self.end)
    observed = []
    for rec in reader.querys(region):
        observed.append([rec[0], rec[1], rec[1]])
    self.assertEqual(self.result, observed)
def test_build_index_with_force_merged_nodups_with_no_preset(self):
    """Re-index the space-delimited file with explicit column settings.

    Instead of a preset, the delimiter and the column numbers are passed
    directly to ``build_index``.
    """
    index_options = dict(delimiter=' ', sc=2, bc=3, ec=3,
                         sc2=6, bc2=7, ec2=7, force=1)
    pypairix.build_index(TEST_FILE_2D_SPACE, **index_options)
    reader = pypairix.open(TEST_FILE_2D_SPACE)
    coords = (self.chrom, self.start, self.end,
              self.chrom2, self.start2, self.end2)
    hits = reader.querys2D('{}:{}-{}|{}:{}-{}'.format(*coords))
    observed = build_it_result(hits, self.f_type)
    self.assertEqual(self.result, observed)
def cis_trans_ratio(pairs_file, outfilename, DIST_THRES=20000, cols=cols_pairs):
    """Measure the cis/trans ratio of a pairs file and write the stats.

    Every block in the index is scanned. Records of a same-chromosome
    block count as ``cis`` when their distance exceeds ``DIST_THRES`` and
    as ``cis_short`` otherwise; records of any other block count as
    ``trans``.

    Parameters
    ----------
    pairs_file : str
        Pairix-indexed pairs file.
    outfilename : str
        File the statistics are written to.
    DIST_THRES : int
        Distance above which a cis pair is counted as ``cis``.
    cols
        Column description forwarded to ``get_distance_and_orientation``.
    """
    stats = CisTransStat()
    reader = pypairix.open(pairs_file)
    blocknames = reader.get_blocknames()
    for blockname in blocknames:
        records = reader.querys2D(blockname)
        left, right = blockname.split(SEPARATOR)
        if left != right:
            # inter-chromosomal block: only the record count matters
            stats.trans += sum(1 for _ in records)
            continue
        for rec in records:
            separation = get_distance_and_orientation(rec, cols)[0]
            if separation > DIST_THRES:
                stats.cis += 1
            else:
                stats.cis_short += 1

    stats.calculate_total()
    stats.calculate_cis_to_trans()
    stats.calculate_percent_long_range_intra()

    # print stats
    with open(outfilename, 'w') as out:
        stats.print_stat(out)
class PairixTest2D_reverse(unittest.TestCase):
    """2D queries issued with the two chromosomes in flipped order."""

    f_type = find_pairs_type(TEST_FILE_2D)
    regions = read_pairs(TEST_FILE_2D, f_type)
    chrom2 = '10'
    start2 = 1
    end2 = 1000000
    chrom = '20'
    start = 50000000
    end = 60000000
    # expected records are selected with the second region first, i.e.
    # the opposite order from the one the queries below use
    result = get_result_2D(regions, chrom2, start2, end2, chrom, start, end)
    pr = pypairix.open(TEST_FILE_2D)

    def test_query2_rev(self):
        """query2D with the flip argument returns the expected records."""
        # 1 is included as last argument to test flipping chromosome order
        hits = self.pr.query2D(self.chrom, self.start, self.end,
                               self.chrom2, self.start2, self.end2, 1)
        observed = build_it_result(hits, self.f_type)
        self.assertEqual(self.result, observed)

    def test_querys_2_rev(self):
        """querys2D with the flip argument returns the expected records."""
        coords = (self.chrom, self.start, self.end,
                  self.chrom2, self.start2, self.end2)
        query = '{}:{}-{}|{}:{}-{}'.format(*coords)
        # 1 is included as last argument to test flipping chromosome order
        hits = self.pr.querys2D(query, 1)
        observed = build_it_result(hits, self.f_type)
        self.assertEqual(self.result, observed)

    def test_query2_rev_fail(self):
        """Without the flip argument the query emits one PairixWarning."""
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            # trigger a warning: flipped chromosomes, no flip argument
            self.pr.query2D(self.chrom, self.start, self.end,
                            self.chrom2, self.start2, self.end2)
            # exactly one warning, of the pairix category
            self.assertEqual(len(caught), 1)
            last = caught[-1]
            self.assertTrue(
                issubclass(last.category, pypairix.PairixWarning))
def __init__(self, filepath, chromsizes, bins, map=map, n_chunks=1,
             is_one_based=False, **kwargs):
    """Set up a chunked reader over a pairix-indexed contact list.

    Parameters
    ----------
    filepath : str
        Path handed to ``pypairix.open``.
    chromsizes, bins
        Forwarded to ``GenomeSegmentation``.
    map : callable, optional
        Map implementation stored on the instance as ``_map``.
    n_chunks : int, optional
        Requested number of chunks; may be raised for very large files.
    is_one_based : bool, optional
        Stored on the instance after conversion to ``bool``.
    **kwargs
        Accepted and ignored.

    Raises
    ------
    ImportError
        If ``pypairix`` cannot be imported.
    RuntimeError
        If the index lists no contigs, or if both ``A|B`` and ``B|A``
        blocks exist for some pair of contigs.
    """
    try:
        import pypairix
    except ImportError:
        raise ImportError(
            "pypairix is required to read pairix-indexed files")
    import dill
    import pickle
    dill.settings['protocol'] = pickle.HIGHEST_PROTOCOL

    self._map = map
    self.n_chunks = n_chunks
    self.is_one_based = bool(is_one_based)

    reader = pypairix.open(filepath, 'r')
    # column numbers as recorded in the index
    self.C1 = reader.get_chr1_col()
    self.C2 = reader.get_chr2_col()
    self.P1 = reader.get_startpos1_col()
    self.P2 = reader.get_startpos2_col()

    # both halves of every 'chrA|chrB' block name
    contig_names = set()
    for blockname in reader.get_blocknames():
        contig_names.update(blockname.split('|'))
    self.file_contigs = contig_names

    if len(self.file_contigs) == 0:
        raise RuntimeError("No reference sequences found.")

    # a triangular file stores each contig pair in one orientation only
    for c1, c2 in itertools.combinations(self.file_contigs, 2):
        if not (reader.exists2(c1, c2) and reader.exists2(c2, c1)):
            continue
        raise RuntimeError(
            "Pairs are not triangular: found blocks " +
            "'{0}|{1}'' and '{1}|{0}'".format(c1, c2))

    # dumb heuristic to prevent excessively large chunks on one worker
    if hasattr(reader, 'get_linecount'):
        n_lines = reader.get_linecount()
        if n_lines < 0:
            # correct int32 overflow bug
            MAXINT32 = 2147483647
            n_lines = MAXINT32 + MAXINT32 + n_lines
        max_chunk = int(100e6)
        suggested = n_lines // 2 // max_chunk
        requested = self.n_chunks
        self.n_chunks = max(requested, suggested)
        if self.n_chunks > requested:
            logger.info(
                "Pairs file has {} lines. Increasing max-split to {}.".format(
                    n_lines, self.n_chunks))

    # all requested contigs will be placed in the output matrix
    self.gs = GenomeSegmentation(chromsizes, bins)

    self.filepath = filepath
    self.n_records = None

    # warn about requested contigs not seen in the contact list
    for chrom in self.gs.contigs:
        if chrom in self.file_contigs:
            continue
        warnings.warn(
            "Did not find contig " +
            " '{}' in contact list file.".format(chrom))
def _fetch_region(filepath, chromsizes, slc, block, columns=None,
                  usecols=None, meta=None):
    """Load the records of one slice of a chromosome-pair block.

    Parameters
    ----------
    filepath : str
        Pairix-indexed file to query.
    chromsizes : mapping
        Chromosome name -> length.
    slc : slice or None
        Range on ``chrom1`` to query (``slc.start`` to ``slc.stop``);
        ``None`` selects the whole chromosome.
    block : pair
        ``(chrom1, chrom2)``; a ``chrom2`` of ``None`` means ``chrom1``.
    columns : sequence of str, optional
        Column names for the resulting frame.
    usecols : sequence of int, optional
        Positions of the record fields to keep.
    meta : pandas.DataFrame, optional
        Empty frame carrying the target columns and dtypes. When omitted
        the raw frame is returned without dtype coercion.

    Returns
    -------
    pandas.DataFrame
        The queried records, or a copy of ``meta`` when nothing matched.
    """
    chrom1, chrom2 = block
    if chrom2 is None:
        chrom2 = chrom1
    if slc is None:
        start, end = 0, chromsizes[chrom1]
    else:
        start, end = slc.start, slc.stop

    f = pypairix.open(filepath, 'r')
    # the second axis always spans the whole of chrom2
    it = f.query2D(chrom1, start, end, chrom2, 0, chromsizes[chrom2])
    if usecols is not None:
        records = [tuple(record[i] for i in usecols) for record in it]
    else:
        records = it

    df = pd.DataFrame.from_records(records, columns=columns)

    # Previously ``meta=None`` crashed on ``meta.copy()`` / ``meta.dtypes``.
    if meta is None:
        return df
    if not len(df):
        return meta.copy()

    # Assign whole columns rather than ``df.loc[:, col] = ...``: from
    # pandas 2.0 the ``.loc`` form writes into the existing column in
    # place, so the converted dtype would be discarded.
    for col, dt in meta.dtypes.items():
        df[col] = df[col].astype(dt)
    return df
def test_build_index_with_force_merged_nodups_tab(self):
    """Re-index the tab-delimited file with explicit column settings."""
    column_options = dict(sc=2, bc=3, ec=3, sc2=6, bc2=7, ec2=7, force=1)
    pypairix.build_index(TEST_FILE_2D, **column_options)

    # test with reindex
    reader = pypairix.open(TEST_FILE_2D)
    coords = (self.chrom, self.start, self.end,
              self.chrom2, self.start2, self.end2)
    hits = reader.querys2D('{}:{}-{}|{}:{}-{}'.format(*coords))
    observed = build_it_result(hits, self.f_type)
    self.assertEqual(self.result, observed)
def test_build_index_with_region_split_character(self):
    """Index with ``^`` as region separator and query with that separator.

    The default index is rebuilt in a ``finally`` clause so that a failure
    while querying cannot leave the ``^``-separated index on disk for the
    tests that run afterwards.
    """
    pypairix.build_index(TEST_FILE_LARGE_CHR, region_split_character="^",
                         force=1)
    try:
        pr2 = pypairix.open(TEST_FILE_LARGE_CHR)
        query = '{}:{}-{}^{}:{}-{}'.format(self.chrom, self.start, self.end,
                                           self.chrom2, self.start2,
                                           self.end2)
        it2 = pr2.querys2D(query)
        pr2_result = build_it_result(it2, self.f_type)
    finally:
        pypairix.build_index(TEST_FILE_LARGE_CHR, force=1)  # revert
    self.assertEqual(self.result, pr2_result)
def test_columnindex(self):
    """The column getters report the expected values for both files."""
    pr = pypairix.open(TEST_FILE_2D)
    pr2 = pypairix.open(TEST_FILE_2D_4DN)
    # (reader, (chr1, chr2, startpos1, startpos2, endpos1, endpos2))
    expectations = (
        (pr, (1, 5, 2, 6, 2, 6)),
        (pr2, (1, 3, 2, 4, 2, 4)),
    )
    for reader, (chr1, chr2, start1, start2, end1, end2) in expectations:
        self.assertEqual(reader.get_chr1_col(), chr1)
        self.assertEqual(reader.get_chr2_col(), chr2)
        self.assertEqual(reader.get_startpos1_col(), start1)
        self.assertEqual(reader.get_startpos2_col(), start2)
        self.assertEqual(reader.get_endpos1_col(), end1)
        self.assertEqual(reader.get_endpos2_col(), end2)
def test_exists2(self):
    """exists2 returns 1 for the indexed chromosome pairs and 0 otherwise."""
    reader = pypairix.open(TEST_FILE_2D_4DN)
    expectations = (
        ("chr21", "chr21", 1),
        ("chr21", "chr22", 1),
        ("chr22", "chr22", 1),
        ("chr22", "chr21", 0),
        ("chr1", "chr2", 0),
        ("1", "2", 0),
    )
    for first, second, expected in expectations:
        self.assertEqual(reader.exists2(first, second), expected)
def distance_histogram (pairs_file, chromsize_file, outfilename, cols=cols_pairs, orientation_list = orientation_list_pairs, max_logdistance=8.4, min_logdistance=1, log_binsize=0.1):
    """Create a log10-scale binned histogram table of read separation distances.

    The histogram is stratified by read orientation (4 different orientations).
    The table includes raw counts, log10 counts (pseudocounts added), contact
    probability, log10 contact probability, and proportions for orientation
    (pseudocounts added). A bin is represented by its mid value on the log10
    scale. Only blocks whose two chromosomes are identical contribute.

    pairs_file: pairix-indexed pairs file to scan.
    chromsize_file: file passed to GenomeSize.
    outfilename: path the histogram table is written to.
    cols: column description passed to get_distance_and_orientation.
    orientation_list: orientations that are counted; records with any other
        orientation are skipped.
    max_logdistance, min_logdistance: log10 distance limits passed to
        DistanceBin; only bins whose mid value lies within them are printed.
    log_binsize: distance bin size in log10 scale.
    """
    gs = GenomeSize(chromsize_file)
    bins = DistanceBin(min_logdistance, max_logdistance, log_binsize)

    # one statistics accumulator per entry of bins.range
    # NOTE(review): ss is indexed by bin number below, which presumably
    # requires bins.range to start at 0 -- confirm against DistanceBin
    ss = []
    for _ in bins.range:
        ss.append(SeparationStat(orientation_list,gs))

    tb=pypairix.open( pairs_file )
    chrplist = tb.get_blocknames()
    # calculate histogram
    for chrp in chrplist:
        chr1, chr2 = chrp.split( SEPARATOR )
        if chr1 == chr2:
            it = tb.querys2D( chrp )
            for x in it:
                distance, orientation = get_distance_and_orientation (x, cols)
                if orientation not in orientation_list:  # for some exceptional cases like '4' in merged_nodup
                    continue
                # remove zero distance, count.
                if distance > 0:
                    bin_number = bins.get_bin_number(distance)
                    # distances falling beyond the last bin are dropped
                    if bin_number <= bins.max_bin_number:
                        ss[bin_number].increment(orientation, chr1)

    # calculate total
    for bin_number in bins.range:
        ss[bin_number].calculate_sumcount()

    # calculate histogram in log10 counts and proportion
    for bin_number in bins.range:
        ss[bin_number].calculate_log10count_per_ori()
        ss[bin_number].calculate_log10sumcount()
        ss[bin_number].calculate_pcount_per_ori()

    # calculate contact probability
    for bin_number in bins.range:
        bin_mid = bins.get_bin_mid(bin_number)
        bin_size = bins.get_bin_size(bin_mid)
        ss[bin_number].calculate_contact_probability_per_chr(bin_mid, bin_size)
        ss[bin_number].calculate_contact_probability(bin_mid, bin_size)

    # print histogram
    with open(outfilename,'w') as f:
        # the header is taken from the first accumulator, so this line
        # raises IndexError if bins.range is empty
        ss[0].print_header(f)
        for bin_number in bins.range:
            bin_mid = bins.get_bin_mid(bin_number)
            if bin_mid <= bins.max_logdistance and bin_mid >= bins.min_logdistance:
                ss[bin_number].print_content(f, bin_mid, bins.get_bin_range_string(bin_mid))
def test_bgzf_block_count(self):
    """bgzf_block_count returns the expected count per chromosome pair."""
    reader = pypairix.open(TEST_FILE_2D_4DN)
    expectations = (
        ("chr21", "chr21", 8),
        ("chr21", "chr22", 1),
        ("chr22", "chr22", 12),
        ("chr22", "chr21", 0),
        ("chr21", "chrY", 0),
        ("chr1", "chr2", 0),
        ("1", "2", 0),
    )
    for first, second, expected in expectations:
        self.assertEqual(reader.bgzf_block_count(first, second), expected)
def test_exists(self):
    """exists returns 1 for the indexed block names and 0 otherwise."""
    reader = pypairix.open(TEST_FILE_2D_4DN)
    expectations = (
        ("chr21|chr21", 1),
        ("chr21|chr22", 1),
        ("chr22|chr22", 1),
        ("chr22|chr21", 0),
        ("chr1|chr2", 0),
        ("chr21", 0),
        ("1|2", 0),
    )
    for blockname, expected in expectations:
        self.assertEqual(reader.exists(blockname), expected)
def aggregate(self, chrom1):
    """Load all records anchored on ``chrom1`` and attach bin IDs.

    For every bin of ``chrom1`` the file is queried against each
    chromosome in ``remaining_chroms``. The collected records are put in a
    DataFrame, cast to ``self.dtypes`` and merged twice with ``self.bins``
    to obtain ``bin1_id`` and ``bin2_id``.

    Parameters
    ----------
    chrom1 : str
        Name of the first chromosome.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``self.out_columns`` sorted by ``bin1_id`` and
        ``bin2_id``, or ``None`` when no record was found.
    """
    import pypairix
    f = pypairix.open(self.filepath, 'r')

    # NOTE(review): cid1 is never used below
    cid1 = self.gs.idmap[chrom1]
    chromsizes = self.gs.chromsizes
    these_bins = self.gs.fetch(chrom1)
    # presumably a label slice selecting chrom1 and every chromosome after
    # it in idmap -- confirm against GenomeSegmentation
    remaining_chroms = self.gs.idmap[chrom1:]
    # positions of the six coordinate fields within a record
    c1 = self.field_numbers['chrom1']
    c2 = self.field_numbers['chrom2']
    s1 = self.field_numbers['start1']
    s2 = self.field_numbers['start2']
    e1 = self.field_numbers['end1']
    e2 = self.field_numbers['end2']

    # read contact matrix one row at a time
    logger.info(chrom1)
    lines = []
    for bin1_id, bin1 in these_bins.iterrows():
        for chrom2, cid2 in six.iteritems(remaining_chroms):
            chrom2_size = chromsizes[chrom2]
            if chrom1 != chrom2 and f.exists2(chrom2, chrom1):  # flipped block
                # The file stores this pair as chrom2|chrom1: query in the
                # stored order, then swap the two sides of each record so
                # that chrom1 ends up in the first position.
                q = []
                for line in f.query2D(chrom2, 0, chrom2_size,
                                      chrom1, bin1.start, bin1.end):
                    line[c1], line[c2] = line[c2], line[c1]
                    line[s1], line[s2] = line[s2], line[s1]
                    line[e1], line[e2] = line[e2], line[e1]
                    q.append(line)
            else:
                q = list(line for line in f.query2D(
                    chrom1, bin1.start, bin1.end, chrom2, 0, chrom2_size))
            lines.extend(q)

    if not lines:
        return None

    df = pandas.DataFrame(lines)
    # keep only the configured fields and name them
    df = df[self.usecols]
    df.columns = self.columns
    for col, dtype in self.dtypes.items():
        df[col] = df[col].astype(dtype)

    # assign bin IDs from bin table
    # Two merges on (chrom, start, end), the first for side 1 and the
    # second for side 2; the suffixes turn the two 'bin' columns into
    # 'bin1' and 'bin2', which are then renamed.
    df = (df.merge(self.bins,
                   left_on=['chrom1', 'start1', 'end1'],
                   right_on=['chrom', 'start', 'end']).merge(
                       self.bins,
                       left_on=['chrom2', 'start2', 'end2'],
                       right_on=['chrom', 'start', 'end'],
                       suffixes=('1', '2')).rename(columns={
                           'bin1': 'bin1_id',
                           'bin2': 'bin2_id'
                       }))
    df = (df[self.out_columns].sort_values(['bin1_id', 'bin2_id']))
    return df
class PairixTest2D(unittest.TestCase):
    """2D range queries and re-indexing on the tab-delimited test file."""

    f_type = find_pairs_type(TEST_FILE_2D)
    regions = read_pairs(TEST_FILE_2D, f_type)
    chrom = '10'
    start = 1
    end = 1000000
    chrom2 = '20'
    start2 = 50000000
    end2 = 60000000
    # expected records, selected directly from the parsed file
    result = get_result_2D(regions, chrom, start, end, chrom2, start2, end2)
    pr = pypairix.open(TEST_FILE_2D)

    def test_query2(self):
        """query2D with explicit coordinates returns the expected records."""
        it = self.pr.query2D(self.chrom, self.start, self.end,
                             self.chrom2, self.start2, self.end2)
        pr_result = build_it_result(it, self.f_type)
        self.assertEqual(self.result, pr_result)

    def test_querys_2(self):
        """querys2D with a region string returns the expected records."""
        query = '{}:{}-{}|{}:{}-{}'.format(self.chrom, self.start, self.end,
                                           self.chrom2, self.start2,
                                           self.end2)
        it = self.pr.querys2D(query)
        pr_result = build_it_result(it, self.f_type)
        self.assertEqual(self.result, pr_result)

    def test_querys_2_bad_order(self):
        """A first region with end before start emits one PairixWarning."""
        # build the query with coordinates in the wrong order
        query = '{}:{}-{}|{}:{}-{}'.format(self.chrom, self.end, self.start,
                                           self.chrom2, self.start2,
                                           self.end2)
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always")
            # trigger a warning
            self.pr.querys2D(query)
            # unittest assertions instead of bare ``assert``: those are
            # stripped under ``python -O`` and give poorer failure output
            self.assertEqual(len(w), 1)
            self.assertTrue(issubclass(w[-1].category,
                                       pypairix.PairixWarning))

    def test_build_index_with_force_merged_nodups_tab(self):
        """Re-index with explicit column settings and re-run the query."""
        pypairix.build_index(TEST_FILE_2D, sc=2, bc=3, ec=3,
                             sc2=6, bc2=7, ec2=7, force=1)

        # test with reindex
        pr2 = pypairix.open(TEST_FILE_2D)
        query = '{}:{}-{}|{}:{}-{}'.format(self.chrom, self.start, self.end,
                                           self.chrom2, self.start2,
                                           self.end2)
        it2 = pr2.querys2D(query)
        pr2_result = build_it_result(it2, self.f_type)
        self.assertEqual(self.result, pr2_result)
def read_pairix(fp, region1, region2=None, chromsizes=None, columns=None,
                usecols=None, dtypes=None, **kwargs):
    """
    Read a pairix-indexed file into DataFrame.

    Parameters
    ----------
    fp : str
        Path to the pairix-indexed file.
    region1 : str or tuple
        First genomic region, in any form ``parse_region`` accepts.
    region2 : str or tuple, optional
        Second genomic region. Defaults to ``region1``.
    chromsizes : Series, optional
        Chromosome lengths. When omitted they are taken from the
        ``#chromsize`` header lines, if the file has any.
    columns : list of str, optional
        Column names. When omitted they are taken from the first
        ``#columns`` header line, if the file has one.
    usecols : list of str, optional
        Names of the columns to keep. Requires ``columns`` to be given or
        to be available from the header.
    dtypes : dict, optional
        Column name -> dtype. Named columns without an entry are
        converted to numeric when possible and left unchanged otherwise.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``usecols`` is given but no column names are available.
    """
    import pypairix
    import cytoolz as toolz

    if dtypes is None:
        dtypes = {}
    f = pypairix.open(fp, "r")

    header = f.get_header()
    if len(header):
        # group the header lines by their '#key' prefix
        header_groups = toolz.groupby(lambda x: x.split(":")[0], header)
        if "#chromsize" in header_groups and chromsizes is None:
            items = [line.split()[1:] for line in header_groups["#chromsize"]]
            if len(items):
                names, lengths = zip(*((item[0], int(item[1]))
                                       for item in items))
                chromsizes = pd.Series(index=names, data=lengths)
        if "#columns" in header_groups and columns is None:
            columns = header_groups["#columns"][0].split()[1:]

    chrom1, start1, end1 = parse_region(region1, chromsizes)
    if region2 is not None:
        chrom2, start2, end2 = parse_region(region2, chromsizes)
    else:
        chrom2, start2, end2 = chrom1, start1, end1

    it = f.query2D(chrom1, start1, end1, chrom2, start2, end2)
    if usecols is not None:
        if columns is None:
            # previously an opaque AttributeError on ``None.index``
            raise ValueError(
                "usecols requires column names: pass `columns` or use a "
                "file with a '#columns' header line")
        argusecols = [columns.index(col) for col in usecols]
        records = [tuple(record[i] for i in argusecols) for record in it]
        columns = usecols
    else:
        records = it

    df = pd.DataFrame.from_records(records, columns=columns)
    if columns is not None:
        for col in columns:
            if col in dtypes:
                df[col] = df[col].astype(dtypes[col])
            else:
                # ``errors="ignore"`` is deprecated in pandas; catching
                # the conversion error keeps the same "leave the column
                # unchanged" outcome.
                try:
                    df[col] = pd.to_numeric(df[col])
                except (ValueError, TypeError):
                    pass
    return df
def read_pairix_block(filepath, block, names=None, dtypes=None, usecols=None,
                      chromsizes=None, chunk_level=0):
    """Build a dask dataframe over one chromosome-pair block.

    The first chromosome of the block is cut at the edges found in
    ``LEVEL[chunk_level]``; every resulting span becomes one partition
    that is loaded by ``_fetch_region``.

    Parameters
    ----------
    filepath : str
        Pairix-indexed file.
    block : pair of str
        ``(chrom1, chrom2)``.
    names, dtypes, usecols
        Forwarded to ``pandas.read_csv`` to derive the empty ``meta``
        frame; ``names`` and ``usecols`` are also passed to each task.
    chromsizes : Series, optional
        Chromosome lengths; read from the file when omitted.
    chunk_level : int
        Index into ``LEVEL``.

    Returns
    -------
    dask.dataframe.DataFrame

    Raises
    ------
    ValueError
        If ``chromsizes`` is omitted and the file provides none.
    """
    if chromsizes is None:
        reader = pypairix.open(filepath)
        size_pairs = reader.get_chromsize()
        if len(size_pairs) == 0:
            raise ValueError("No chromsize headers found in file. "
                             "They must be provided explicitly.")
        chromsizes = pd.Series({name: int(size)
                                for name, size in size_pairs})
        del reader

    chrom1, chrom2 = block
    chrom1_len = chromsizes[chrom1]

    # empty frame carrying the column names and dtypes of the file
    sample = pd.read_csv(
        filepath,
        sep='\t',
        comment='#',
        header=None,
        names=names,
        dtype=dtypes,
        usecols=usecols,
        iterator=True).read(1024)
    meta = sample.iloc[0:0]

    # Make a unique task name
    task_name = 'read-pairix-block-' + tokenize(
        filepath, chromsizes, block, names, dtypes, usecols, chunk_level)

    # Build the task graph
    edges = LEVEL[chunk_level]
    edges = edges[:np.searchsorted(edges, chrom1_len)]
    # make sure the last span ends at the chromosome end
    if edges[-1] != chrom1_len:
        edges = np.r_[edges, chrom1_len]

    divisions = []
    dsk = {}
    for part, (lo, hi) in enumerate(zip(edges[:-1], edges[1:])):
        if part == 0:
            divisions.append(lo)
        divisions.append(hi - 1)
        dsk[task_name, part] = (_fetch_region, filepath, chromsizes,
                                slice(lo, hi), block, names, usecols, meta)

    # Generate ddf from dask graph
    return dd.DataFrame(dsk, task_name, meta, tuple(divisions))
class PairixTest2D_LargeChr(unittest.TestCase):
    """2D queries and index building on the large-chromosome test file."""

    f_type = find_pairs_type(TEST_FILE_LARGE_CHR)
    regions = read_pairs(TEST_FILE_LARGE_CHR, f_type)
    chrom = 'chr21'
    start = 1
    end = 1073741824
    chrom2 = 'chr22'
    start2 = 1
    end2 = 1073741824
    # expected records, selected directly from the parsed file
    result = get_result_2D(regions, chrom, start, end, chrom2, start2, end2)
    pr = pypairix.open(TEST_FILE_LARGE_CHR)

    def test_query2_4dn(self):
        """query2D with explicit coordinates returns the expected records."""
        it = self.pr.query2D(self.chrom, self.start, self.end,
                             self.chrom2, self.start2, self.end2)
        pr_result = build_it_result(it, self.f_type)
        self.assertEqual(self.result, pr_result)

    def test_querys_2_4dn(self):
        """querys2D with a region string returns the expected records."""
        query = '{}:{}-{}|{}:{}-{}'.format(self.chrom, self.start, self.end,
                                           self.chrom2, self.start2,
                                           self.end2)
        it = self.pr.querys2D(query)
        pr_result = build_it_result(it, self.f_type)
        self.assertEqual(self.result, pr_result)

    def test_build_index_without_force(self):
        """Indexing without ``force`` fails when the index already exists."""
        # expect an error here... the px2 file already exists
        with self.assertRaises(pypairix.PairixError) as error:
            pypairix.build_index(TEST_FILE_LARGE_CHR)
        # errors are handled differently in python 2 and python 3
        if sys.version_info > (3, 0):
            self.assertEqual(
                error.exception.__str__(),
                "The index file exists. Please use force=1 to overwrite.")
        else:
            self.assertEqual(
                error.exception.message,
                "The index file exists. Please use force=1 to overwrite.")

    def test_build_index_with_region_split_character(self):
        """Index with ``^`` as region separator and query with it.

        The default index is rebuilt in a ``finally`` clause so that a
        failure while querying cannot leave the ``^``-separated index on
        disk for the other tests of this class.
        """
        pypairix.build_index(TEST_FILE_LARGE_CHR,
                             region_split_character="^", force=1)
        try:
            pr2 = pypairix.open(TEST_FILE_LARGE_CHR)
            query = '{}:{}-{}^{}:{}-{}'.format(self.chrom, self.start,
                                               self.end, self.chrom2,
                                               self.start2, self.end2)
            it2 = pr2.querys2D(query)
            pr2_result = build_it_result(it2, self.f_type)
        finally:
            pypairix.build_index(TEST_FILE_LARGE_CHR, force=1)  # revert
        self.assertEqual(self.result, pr2_result)

    def test_build_index_with_force(self):
        """Re-index with ``force=1``; relies on the ``pairs.gz`` extension."""
        pypairix.build_index(TEST_FILE_LARGE_CHR, force=1)
        pr2 = pypairix.open(TEST_FILE_LARGE_CHR)
        query = '{}:{}-{}|{}:{}-{}'.format(self.chrom, self.start, self.end,
                                           self.chrom2, self.start2,
                                           self.end2)
        it2 = pr2.querys2D(query)
        pr2_result = build_it_result(it2, self.f_type)
        self.assertEqual(self.result, pr2_result)
def read_pairix(filepath, names, blocks=None, chromsizes=None, **kwargs):
    """
    Read a Pairix-indexed BEDPE-like file as a dask dataframe.

    Parameters
    ----------
    filepath : str
        Path to the pairs or paired-end interval file, not the index file.
        (i.e. omit the .px2 extension).
    names : sequence of str
        Names for the columns in the pairs file.
    blocks : sequence of str or tuple
        List of paired chromosome blocks to load. If a list of single
        chromosome names is given, then all pair permutations are loaded.
        An empty sequence yields an empty mapping. When omitted, all
        blocks listed in the index are loaded.
    chromsizes : dict or Series, optional
        Chromosome lengths to use if chromsizes headers are not available.
    **kwargs
        Forwarded to ``read_pairix_block`` (for example ``chunk_level``,
        which can be increased for a finer partition).

    Returns
    -------
    OrderedDict
        A mapping of chromosome pairs to dask dataframes. Blocks naming a
        chromosome that is absent from ``chromsizes`` are skipped.

    Raises
    ------
    ValueError
        If ``chromsizes`` is omitted and the file provides none.
    """
    f = pypairix.open(filepath)

    if chromsizes is None:
        cs = f.get_chromsize()
        if not len(cs):
            raise ValueError(
                "No chromsize headers found in file. "
                "They must be provided explicitly."
            )
        chromsizes = pd.Series({c: int(s) for c, s in cs})

    if blocks is None:
        blocks = [s.split("|") for s in f.get_blocknames()]
    elif len(blocks) and isinstance(blocks[0], str):
        # The length check keeps an empty ``blocks`` from raising
        # IndexError on ``blocks[0]``. A flat list of names expands to
        # every ordered pair.
        blocks = [(ci, cj) for ci in blocks for cj in blocks]

    dct = OrderedDict()
    for chrom1, chrom2 in blocks:
        if chrom1 in chromsizes and chrom2 in chromsizes:
            dct[chrom1, chrom2] = read_pairix_block(
                filepath, (chrom1, chrom2), names,
                chromsizes=chromsizes, **kwargs
            )
    return dct
def __init__(self, filepath, chromsizes, bins, field_numbers=None,
             field_dtypes=None):
    """Set up a reader over a pairix-indexed bg2 file.

    Parameters
    ----------
    filepath : str
        Path handed to ``pypairix.open``.
    chromsizes, bins
        Forwarded to ``GenomeSegmentation``.
    field_numbers : dict, optional
        Field name -> column number; merged over ``FIELD_NUMBERS``.
    field_dtypes : dict, optional
        Field name -> dtype; merged over ``FIELD_DTYPES``.

    Raises
    ------
    ImportError
        If ``pypairix`` cannot be imported.
    """
    try:
        import pypairix
    except ImportError:
        raise ImportError(
            "pypairix is required to read pairix-indexed files")

    self._map = map
    self.filepath = filepath

    reader = pypairix.open(self.filepath, 'r')
    # both halves of every 'chrA|chrB' block name
    contig_names = set()
    for blockname in reader.get_blocknames():
        contig_names.update(blockname.split('|'))
    self.file_contigs = contig_names

    # all requested contigs will be placed in the output matrix
    self.gs = GenomeSegmentation(chromsizes, bins)
    bin_table = self.gs.bins.copy()
    self.bins = bin_table
    self.bins['chrom'] = self.bins['chrom'].astype(object)
    self.bins['bin'] = self.bins.index

    # warn about requested contigs not seen in the contact list
    for chrom in self.gs.contigs:
        if chrom in self.file_contigs:
            continue
        warnings.warn("Did not find contig " +
                      " '{}' in bg2 file.".format(chrom))

    # Assign the column numbers
    self.field_numbers = self.FIELD_NUMBERS.copy()
    if field_numbers is not None:
        self.field_numbers.update(field_numbers)
    self.columns = list(self.field_numbers.keys())
    self.usecols = list(self.field_numbers.values())

    # Assign the column dtypes. Assume additional value fields are float.
    self.out_columns = ['bin1_id', 'bin2_id', 'count']
    self.dtypes = self.FIELD_DTYPES.copy()
    extra_fields = [col for col in self.columns if col not in self.dtypes]
    for col in extra_fields:
        self.out_columns.append(col)
        self.dtypes[col] = float

    # Override defaults
    if field_dtypes is not None:
        self.dtypes.update(field_dtypes)
class PairixTest_2(unittest.TestCase):
    """2D query whose second region is a bare chromosome name."""

    f_type = find_pairs_type(TEST_FILE_2D)
    regions = read_pairs(TEST_FILE_2D, f_type)
    chrom = '10'
    start = 25944
    end = 27000000
    chrom2 = '20'
    # the expected records place no effective bound on the second region
    result = get_result_2D(regions, chrom, start, end, chrom2, 0, sys.maxsize)
    pr = pypairix.open(TEST_FILE_2D)

    def test_querys(self):
        """querys2D accepts a second region given without coordinates."""
        first_region = (self.chrom, self.start, self.end)
        query = '{}:{}-{}|{}'.format(*(first_region + (self.chrom2,)))
        hits = self.pr.querys2D(query)
        observed = build_it_result(hits, self.f_type)
        self.assertEqual(self.result, observed)
def test_blocknames(self):
    """get_blocknames lists exactly the chromosome pairs in the file."""
    # block list obtained from get_blocknames()
    reader = pypairix.open(TEST_FILE_2D)
    retrieved = reader.get_blocknames()
    retrieved.sort()

    # true block list: unique 'field0|field3' names built from the
    # parsed records
    f_type = find_pairs_type(TEST_FILE_2D)
    regions = read_pairs(TEST_FILE_2D, f_type)
    unique_names = set()
    for rec in regions:
        unique_names.add(rec[0] + '|' + rec[3])
    expected = sorted(unique_names)

    self.assertEqual(retrieved, expected)