Ejemplo n.º 1
0
 def _size(self, block):
     """Return the number of records a 2D query yields for ``block``.

     ``block`` is a ``(chrom1, chrom2)`` pair. Both chromosomes are queried
     over their full extent, from 0 to the length recorded in
     ``self.gs.chromsizes``.
     """
     import pypairix

     handle = pypairix.open(self.filepath, 'r')
     first, second = block
     hits = handle.query2D(
         first, 0, self.gs.chromsizes[first],
         second, 0, self.gs.chromsizes[second],
         1)
     # Only the count is needed, so the iterator is drained without keeping
     # any record.
     n_hits = 0
     for _ in hits:
         n_hits += 1
     return n_hits
Ejemplo n.º 2
0
    def __init__(self, filepath, chromsizes, bins, map=map, **kwargs):
        """Prepare a reader over the pairix-indexed file at ``filepath``.

        The column getters of the opened index are cached on the instance
        (``C1``, ``C2``, ``P1``, ``P2``), together with the set of contig
        names that occur in the index block names. ``chromsizes`` and
        ``bins`` are handed to ``GenomeSegmentation``. A warning is issued
        for every requested contig that is absent from the file.

        Raises
        ------
        ImportError
            If ``pypairix`` cannot be imported.
        """
        try:
            import pypairix
        except ImportError:
            raise ImportError(
                "pypairix is required to read pairix-indexed files")

        self._map = map
        handle = pypairix.open(filepath, 'r')
        self.C1 = handle.get_chr1_col()
        self.C2 = handle.get_chr2_col()
        self.P1 = handle.get_startpos1_col()
        self.P2 = handle.get_startpos2_col()

        # Block names look like "<contig>|<contig>"; collect both sides.
        contigs_in_file = set()
        for blockname in handle.get_blocknames():
            contigs_in_file.update(blockname.split('|'))
        self.file_contigs = contigs_in_file

        # Every requested contig gets a place in the output matrix.
        self.gs = GenomeSegmentation(chromsizes, bins)

        self.filepath = filepath
        self.n_records = None

        # Tell the user about requested contigs the file never mentions.
        template = "Did not find contig  '{}' in contact list file."
        for chrom in self.gs.contigs:
            if chrom in self.file_contigs:
                continue
            warnings.warn(template.format(chrom))
Ejemplo n.º 3
0
class PairixTest2D_4DN(unittest.TestCase):
    """2D range queries against TEST_FILE_2D_4DN (chr21 x chr22 window)."""

    # Reference records are prepared once, when the class body executes.
    f_type = find_pairs_type(TEST_FILE_2D_4DN)
    regions = read_pairs(TEST_FILE_2D_4DN, f_type)

    # first region of the query
    chrom = 'chr21'
    start = 1
    end = 48129895

    # second region of the query
    chrom2 = 'chr22'
    start2 = 1
    end2 = 51304566

    # reverse reversed results to get them in the required order here
    result = get_result_2D(regions, chrom, start, end, chrom2, start2, end2)
    pr = pypairix.open(TEST_FILE_2D_4DN)

    def test_query2_4dn(self):
        # window passed to query2D as six separate arguments
        hits = self.pr.query2D(
            self.chrom, self.start, self.end,
            self.chrom2, self.start2, self.end2)
        observed = build_it_result(hits, self.f_type)
        self.assertEqual(self.result, observed)

    def test_querys_2_4dn(self):
        # same window, encoded as one "chr:start-end|chr:start-end" string
        window = (self.chrom, self.start, self.end,
                  self.chrom2, self.start2, self.end2)
        query = '{}:{}-{}|{}:{}-{}'.format(*window)
        hits = self.pr.querys2D(query)
        observed = build_it_result(hits, self.f_type)
        self.assertEqual(self.result, observed)
Ejemplo n.º 4
0
class PairixTest2D(unittest.TestCase):
    """2D range queries against TEST_FILE_2D (chromosome '10' x '20')."""

    # Reference records are prepared once, when the class body executes.
    f_type = find_pairs_type(TEST_FILE_2D)
    regions = read_pairs(TEST_FILE_2D, f_type)

    # first region of the query
    chrom = '10'
    start = 1
    end = 1000000

    # second region of the query
    chrom2 = '20'
    start2 = 50000000
    end2 = 60000000

    result = get_result_2D(regions, chrom, start, end, chrom2, start2, end2)
    pr = pypairix.open(TEST_FILE_2D)

    def test_query2(self):
        # window passed to query2D as six separate arguments
        hits = self.pr.query2D(
            self.chrom, self.start, self.end,
            self.chrom2, self.start2, self.end2)
        observed = build_it_result(hits, self.f_type)
        self.assertEqual(self.result, observed)

    def test_querys_2(self):
        # same window, encoded as a single query string
        window = (self.chrom, self.start, self.end,
                  self.chrom2, self.start2, self.end2)
        query = '{}:{}-{}|{}:{}-{}'.format(*window)
        hits = self.pr.querys2D(query)
        observed = build_it_result(hits, self.f_type)
        self.assertEqual(self.result, observed)

    def test_querys_2_bad_order(self):
        # first region deliberately written as end-start instead of start-end
        swapped = (self.chrom, self.end, self.start,
                   self.chrom2, self.start2, self.end2)
        query = '{}:{}-{}|{}:{}-{}'.format(*swapped)
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            # the malformed query is expected to emit a warning
            self.pr.querys2D(query)
            # exactly one warning, and of the pypairix-specific category
            self.assertEqual(len(caught), 1)
            self.assertTrue(
                issubclass(caught[-1].category, pypairix.PairixWarning))
Ejemplo n.º 5
0
 def test_build_index_with_force(self):   ## recognizing file extension pairs.gz
     # Rebuild the index with force=1, reopen the file and confirm that the
     # reference window still returns the expected records.
     pypairix.build_index(TEST_FILE_LARGE_CHR, force=1)
     reopened = pypairix.open(TEST_FILE_LARGE_CHR)
     window = (self.chrom, self.start, self.end,
               self.chrom2, self.start2, self.end2)
     query = '{}:{}-{}|{}:{}-{}'.format(*window)
     hits = reopened.querys2D(query)
     observed = build_it_result(hits, self.f_type)
     self.assertEqual(self.result, observed)
Ejemplo n.º 6
0
class PairixTest(unittest.TestCase):
    """1D range queries against TEST_FILE_1D (chr10 window)."""

    # Reference records are prepared once, when the class body executes.
    regions = read_vcf(TEST_FILE_1D)
    chrom = 'chr10'
    start = 25944
    end = 27000000
    result = get_result(regions, chrom, start, end)
    pr = pypairix.open(TEST_FILE_1D)

    def test_query(self):
        # window passed to query() as separate arguments
        observed = []
        for rec in self.pr.query(self.chrom, self.start, self.end):
            # field 1 is used twice to build the three-element row
            observed.append([rec[0], rec[1], rec[1]])
        self.assertEqual(self.result, observed)

    def test_querys(self):
        # same window, encoded as a "chr:start-end" string
        query = '{}:{}-{}'.format(self.chrom, self.start, self.end)
        observed = []
        for rec in self.pr.querys(query):
            observed.append([rec[0], rec[1], rec[1]])
        self.assertEqual(self.result, observed)

    def test_build_index_with_force_vcf(self):  ## recognizing file extension vcf.gz
        # rebuild the index with force=1, then query through a new handle
        pypairix.build_index(TEST_FILE_1D, force=1)
        reopened = pypairix.open(TEST_FILE_1D)
        query = '{}:{}-{}'.format(self.chrom, self.start, self.end)
        observed = []
        for rec in reopened.querys(query):
            observed.append([rec[0], rec[1], rec[1]])
        self.assertEqual(self.result, observed)
Ejemplo n.º 7
0
 def test_build_index_with_force_merged_nodups(self):  ## recognizing preset merged_nodups
     # Re-index with the "merged_nodups" preset (force=1), reopen the file
     # and confirm the reference window still returns the expected records.
     pypairix.build_index(TEST_FILE_2D_SPACE, "merged_nodups", force=1)
     reopened = pypairix.open(TEST_FILE_2D_SPACE)
     window = (self.chrom, self.start, self.end,
               self.chrom2, self.start2, self.end2)
     query = '{}:{}-{}|{}:{}-{}'.format(*window)
     hits = reopened.querys2D(query)
     observed = build_it_result(hits, self.f_type)
     self.assertEqual(self.result, observed)
Ejemplo n.º 8
0
class PairixTest2DSpace(unittest.TestCase):
    """2D range queries against TEST_FILE_2D_SPACE, read with ' ' as delimiter."""

    # Reference records are prepared once, when the class body executes.
    f_type = find_pairs_type(TEST_FILE_2D_SPACE, ' ')
    regions = read_pairs(TEST_FILE_2D_SPACE, f_type, ' ')

    # first region of the query
    chrom = '10'
    start = 1
    end = 1000000

    # second region of the query
    chrom2 = '20'
    start2 = 50000000
    end2 = 60000000

    result = get_result_2D(regions, chrom, start, end, chrom2, start2, end2)
    pr = pypairix.open(TEST_FILE_2D_SPACE)

    def test_query2(self):
        # window passed to query2D as six separate arguments
        hits = self.pr.query2D(
            self.chrom, self.start, self.end,
            self.chrom2, self.start2, self.end2)
        observed = build_it_result(hits, self.f_type)
        self.assertEqual(self.result, observed)

    def test_querys_2(self):
        # same window, encoded as a single query string
        window = (self.chrom, self.start, self.end,
                  self.chrom2, self.start2, self.end2)
        query = '{}:{}-{}|{}:{}-{}'.format(*window)
        hits = self.pr.querys2D(query)
        observed = build_it_result(hits, self.f_type)
        self.assertEqual(self.result, observed)

    def test_build_index_with_force_merged_nodups(self):  ## recognizing preset merged_nodups
        # re-index with the "merged_nodups" preset, then query a new handle
        pypairix.build_index(TEST_FILE_2D_SPACE, "merged_nodups", force=1)
        reopened = pypairix.open(TEST_FILE_2D_SPACE)
        window = (self.chrom, self.start, self.end,
                  self.chrom2, self.start2, self.end2)
        query = '{}:{}-{}|{}:{}-{}'.format(*window)
        hits = reopened.querys2D(query)
        observed = build_it_result(hits, self.f_type)
        self.assertEqual(self.result, observed)
Ejemplo n.º 9
0
 def test_build_index_with_force_vcf(self):  ## recognizing file extension vcf.gz
     # Rebuild the index with force=1, reopen the file and confirm that the
     # reference window still returns the expected rows.
     pypairix.build_index(TEST_FILE_1D, force=1)
     reopened = pypairix.open(TEST_FILE_1D)
     query = '{}:{}-{}'.format(self.chrom, self.start, self.end)
     observed = []
     for rec in reopened.querys(query):
         # field 1 is used twice to build the three-element row
         observed.append([rec[0], rec[1], rec[1]])
     self.assertEqual(self.result, observed)
Ejemplo n.º 10
0
 def test_build_index_with_force_merged_nodups_with_no_preset(self):  ## recognizing custom parameters
     # Re-index with an explicit delimiter and explicit column numbers
     # instead of a preset, then confirm the reference window still matches.
     pypairix.build_index(
         TEST_FILE_2D_SPACE,
         delimiter=' ',
         sc=2, bc=3, ec=3,
         sc2=6, bc2=7, ec2=7,
         force=1)
     reopened = pypairix.open(TEST_FILE_2D_SPACE)
     window = (self.chrom, self.start, self.end,
               self.chrom2, self.start2, self.end2)
     query = '{}:{}-{}|{}:{}-{}'.format(*window)
     hits = reopened.querys2D(query)
     observed = build_it_result(hits, self.f_type)
     self.assertEqual(self.result, observed)
Ejemplo n.º 11
0
def cis_trans_ratio(pairs_file,
                    outfilename,
                    DIST_THRES=20000,
                    cols=cols_pairs):
    """Compute cis/trans statistics for a pairs file and write them out.

    Every block named in the pairix index is scanned. Records of blocks
    whose two chromosome names differ are counted as trans. Records of
    same-chromosome blocks are counted as cis when their distance exceeds
    ``DIST_THRES`` and as short-range cis otherwise. The resulting
    statistics are printed to ``outfilename``.
    """
    stats = CisTransStat()

    index = pypairix.open(pairs_file)
    for blockname in index.get_blocknames():
        records = index.querys2D(blockname)
        left, right = blockname.split(SEPARATOR)

        if left != right:
            # inter-chromosomal block: only the record count matters
            stats.trans += sum(1 for _ in records)
            continue

        for rec in records:
            separation = get_distance_and_orientation(rec, cols)[0]
            if separation > DIST_THRES:
                stats.cis += 1
            else:
                stats.cis_short += 1

    # derived quantities
    stats.calculate_total()
    stats.calculate_cis_to_trans()
    stats.calculate_percent_long_range_intra()

    # write the report
    with open(outfilename, 'w') as out:
        stats.print_stat(out)
Ejemplo n.º 12
0
class PairixTest2D_reverse(unittest.TestCase):
    """2D queries on TEST_FILE_2D issued with the two chromosomes swapped."""

    # Reference records are prepared once, when the class body executes.
    f_type = find_pairs_type(TEST_FILE_2D)
    regions = read_pairs(TEST_FILE_2D, f_type)

    # Note the swap: the "2" attributes hold chromosome '10', the plain
    # attributes hold chromosome '20'.
    chrom2 = '10'
    start2 = 1
    end2 = 1000000
    chrom = '20'
    start = 50000000
    end = 60000000

    # reverse reversed results to get them in the required order here
    result = get_result_2D(regions, chrom2, start2, end2, chrom, start, end)
    pr = pypairix.open(TEST_FILE_2D)

    def test_query2_rev(self):
        # 1 is included as last argument to test flipping chromosome order
        hits = self.pr.query2D(
            self.chrom, self.start, self.end,
            self.chrom2, self.start2, self.end2,
            1)
        observed = build_it_result(hits, self.f_type)
        self.assertEqual(self.result, observed)

    def test_querys_2_rev(self):
        window = (self.chrom, self.start, self.end,
                  self.chrom2, self.start2, self.end2)
        query = '{}:{}-{}|{}:{}-{}'.format(*window)
        # 1 is included as last argument to test flipping chromosome order
        hits = self.pr.querys2D(query, 1)
        observed = build_it_result(hits, self.f_type)
        self.assertEqual(self.result, observed)

    def test_query2_rev_fail(self):
        # do not include 1 to test flipped order of chrs; expect this to hit a PairixWarning
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            # the query in flipped order is expected to emit a warning
            self.pr.query2D(
                self.chrom, self.start, self.end,
                self.chrom2, self.start2, self.end2)
            # exactly one warning, and of the pypairix-specific category
            self.assertEqual(len(caught), 1)
            self.assertTrue(
                issubclass(caught[-1].category, pypairix.PairixWarning))
Ejemplo n.º 13
0
    def __init__(self, filepath, chromsizes, bins, map=map, n_chunks=1, is_one_based=False, **kwargs):
        """Prepare a chunked reader over the pairix-indexed file at ``filepath``.

        Parameters
        ----------
        filepath : str
            Path handed to ``pypairix.open``.
        chromsizes, bins
            Passed unchanged to ``GenomeSegmentation``.
        map : callable, optional
            Map implementation stored on the instance as ``_map``.
        n_chunks : int, optional
            Requested number of chunks. It is raised automatically when the
            index reports enough lines (see the heuristic below).
        is_one_based : bool, optional
            Stored as ``self.is_one_based`` after coercion to ``bool``.
        **kwargs
            Accepted for signature compatibility; not used here.

        Raises
        ------
        ImportError
            If ``pypairix`` cannot be imported.
        RuntimeError
            If the index names no contigs, or if both ``a|b`` and ``b|a``
            blocks exist for some pair of contigs.
        """
        try:
            import pypairix
        except ImportError:
            raise ImportError(
                "pypairix is required to read pairix-indexed files")

        import dill
        import pickle
        # NOTE: this changes dill's process-wide settings, not just this
        # instance.
        dill.settings['protocol'] = pickle.HIGHEST_PROTOCOL

        self._map = map
        self.n_chunks = n_chunks
        self.is_one_based = bool(is_one_based)
        f = pypairix.open(filepath, 'r')
        self.C1 = f.get_chr1_col()
        self.C2 = f.get_chr2_col()
        self.P1 = f.get_startpos1_col()
        self.P2 = f.get_startpos2_col()
        # Block names look like "<contig>|<contig>"; collect both sides.
        self.file_contigs = set(
            itertools.chain.from_iterable(
                b.split('|') for b in f.get_blocknames()))

        if not self.file_contigs:
            raise RuntimeError("No reference sequences found.")
        # Each unordered contig pair may be stored in one orientation only.
        for c1, c2 in itertools.combinations(self.file_contigs, 2):
            if f.exists2(c1, c2) and f.exists2(c2, c1):
                raise RuntimeError(
                    "Pairs are not triangular: found blocks " +
                    "'{0}|{1}' and '{1}|{0}'".format(c1, c2))

        # dumb heuristic to prevent excessively large chunks on one worker
        if hasattr(f, 'get_linecount'):
            n_lines = f.get_linecount()
            if n_lines < 0:
                # A negative count is treated as a wrapped signed 32-bit
                # value; adding 2**32 undoes the wrap-around. (Adding
                # 2 * MAXINT32 would be two short.)
                n_lines += 2 ** 32
            max_chunk = int(100e6)
            n_chunks = n_lines // 2 // max_chunk
            old_n = self.n_chunks
            self.n_chunks = max(self.n_chunks, n_chunks)
            if self.n_chunks > old_n:
                logger.info(
                    "Pairs file has {} lines. Increasing max-split to {}.".format(
                    n_lines, self.n_chunks))

        # all requested contigs will be placed in the output matrix
        self.gs = GenomeSegmentation(chromsizes, bins)

        self.filepath = filepath
        self.n_records = None

        # warn about requested contigs not seen in the contact list
        for chrom in self.gs.contigs:
            if chrom not in self.file_contigs:
                warnings.warn(
                    "Did not find contig " +
                    " '{}' in contact list file.".format(chrom))
Ejemplo n.º 14
0
def _fetch_region(filepath, chromsizes, slc, block, columns=None,
                  usecols=None, meta=None):
    """Load the records of one pairix query into a DataFrame.

    Parameters
    ----------
    filepath : str
        Path handed to ``pypairix.open``.
    chromsizes : mapping
        Chromosome lengths, indexed by chromosome name.
    slc : slice or None
        Range on ``chrom1`` to query (``slc.start`` to ``slc.stop``).
        ``None`` selects the whole chromosome.
    block : tuple
        ``(chrom1, chrom2)``. A ``chrom2`` of ``None`` means ``chrom1``.
    columns : sequence of str, optional
        Column names for the resulting frame.
    usecols : sequence of int, optional
        Positions of the record fields to keep.
    meta : DataFrame, optional
        Empty frame describing the expected columns and dtypes. When given,
        it replaces an empty result and its dtypes are applied to the
        output. When omitted, the frame is returned as built.

    Returns
    -------
    DataFrame
    """
    chrom1, chrom2 = block
    if chrom2 is None:
        chrom2 = chrom1
    if slc is None:
        start, end = 0, chromsizes[chrom1]
    else:
        start, end = slc.start, slc.stop

    f = pypairix.open(filepath, 'r')
    # chrom2 is always queried over its full extent.
    it = f.query2D(chrom1, start, end, chrom2, 0, chromsizes[chrom2])
    if usecols is not None:
        records = [tuple(record[i] for i in usecols) for record in it]
    else:
        records = it

    df = pd.DataFrame.from_records(records, columns=columns)

    # Without a template there is nothing to coerce to; previously this
    # path failed with AttributeError on ``None``.
    if meta is None:
        return df

    if not len(df):
        df = meta.copy()

    # Assign whole columns rather than ``df.loc[:, col] = ...``: the latter
    # writes into the existing column in place on recent pandas, which keeps
    # the old dtype and silently drops the cast.
    for col, dt in meta.dtypes.items():
        df[col] = df[col].astype(dt)

    return df
Ejemplo n.º 15
0
 def test_build_index_with_force_merged_nodups_tab(self):  ## recognizing custom set
     # Re-index with explicit column numbers (no preset, no delimiter
     # argument), then confirm the reference window still matches.
     pypairix.build_index(
         TEST_FILE_2D,
         sc=2, bc=3, ec=3,
         sc2=6, bc2=7, ec2=7,
         force=1)
     # test with reindex
     reopened = pypairix.open(TEST_FILE_2D)
     window = (self.chrom, self.start, self.end,
               self.chrom2, self.start2, self.end2)
     query = '{}:{}-{}|{}:{}-{}'.format(*window)
     hits = reopened.querys2D(query)
     observed = build_it_result(hits, self.f_type)
     self.assertEqual(self.result, observed)
Ejemplo n.º 16
0
 def test_build_index_with_region_split_character(self):
     # Re-index with "^" as the region split character and query with a
     # "^"-separated string instead of the "|" used by the other tests.
     pypairix.build_index(TEST_FILE_LARGE_CHR, region_split_character="^", force=1)
     try:
         pr2 = pypairix.open(TEST_FILE_LARGE_CHR)
         query = '{}:{}-{}^{}:{}-{}'.format(self.chrom, self.start, self.end, self.chrom2, self.start2, self.end2)
         it2 = pr2.querys2D(query)
         pr2_result = build_it_result(it2, self.f_type)
     finally:
         # Always restore the default index, even when the query itself
         # raises, so tests sharing this file do not see the "^" index.
         pypairix.build_index(TEST_FILE_LARGE_CHR, force=1)  # revert
     self.assertEqual(self.result, pr2_result)
Ejemplo n.º 17
0
    def test_columnindex(self):
        # Column getters, in the order they are checked for each file.
        getters = (
            'get_chr1_col',
            'get_chr2_col',
            'get_startpos1_col',
            'get_startpos2_col',
            'get_endpos1_col',
            'get_endpos2_col',
        )

        pr_2d = pypairix.open(TEST_FILE_2D)
        pr_4dn = pypairix.open(TEST_FILE_2D_4DN)

        # (handle, expected value per getter)
        expectations = (
            (pr_2d, (1, 5, 2, 6, 2, 6)),
            (pr_4dn, (1, 3, 2, 4, 2, 4)),
        )
        for handle, expected_columns in expectations:
            for getter, expected in zip(getters, expected_columns):
                self.assertEqual(getattr(handle, getter)(), expected)
Ejemplo n.º 18
0
 def test_exists2(self):
     # (chrom1, chrom2, expected return value of exists2)
     cases = (
         ("chr21", "chr21", 1),
         ("chr21", "chr22", 1),
         ("chr22", "chr22", 1),
         ("chr22", "chr21", 0),
         ("chr1", "chr2", 0),
         ("1", "2", 0),
     )
     handle = pypairix.open(TEST_FILE_2D_4DN)
     for first, second, expected in cases:
         self.assertEqual(handle.exists2(first, second), expected)
Ejemplo n.º 19
0
def distance_histogram(pairs_file,
                       chromsize_file,
                       outfilename,
                       cols=cols_pairs,
                       orientation_list=orientation_list_pairs,
                       max_logdistance=8.4,
                       min_logdistance=1,
                       log_binsize=0.1):
    """Write a log10-binned histogram table of read separation distances.

    The histogram is stratified by read orientation. Per the table layout,
    each row reports raw counts, log10 counts (pseudocounts added), contact
    probability, log10 contact probability and per-orientation proportions
    (pseudocounts added). A bin is represented by its mid value on the log10
    scale.

    log_binsize: distance bin size in log10 scale.
    """
    genome = GenomeSize(chromsize_file)
    bins = DistanceBin(min_logdistance, max_logdistance, log_binsize)

    # one statistics accumulator per distance bin
    stats = [SeparationStat(orientation_list, genome) for _ in bins.range]

    index = pypairix.open(pairs_file)
    blocknames = index.get_blocknames()

    # accumulate counts; only same-chromosome blocks contribute
    for blockname in blocknames:
        chr1, chr2 = blockname.split(SEPARATOR)
        if chr1 != chr2:
            continue
        for rec in index.querys2D(blockname):
            distance, orientation = get_distance_and_orientation(rec, cols)
            # for some exceptional cases like '4' in merged_nodup
            if orientation not in orientation_list:
                continue
            # zero-distance records are not counted
            if not distance > 0:
                continue
            bin_number = bins.get_bin_number(distance)
            if bin_number <= bins.max_bin_number:
                stats[bin_number].increment(orientation, chr1)

    # totals per bin
    for bin_number in bins.range:
        stats[bin_number].calculate_sumcount()

    # log10 counts and per-orientation proportions
    for bin_number in bins.range:
        entry = stats[bin_number]
        entry.calculate_log10count_per_ori()
        entry.calculate_log10sumcount()
        entry.calculate_pcount_per_ori()

    # contact probabilities
    for bin_number in bins.range:
        entry = stats[bin_number]
        bin_mid = bins.get_bin_mid(bin_number)
        bin_size = bins.get_bin_size(bin_mid)
        entry.calculate_contact_probability_per_chr(bin_mid, bin_size)
        entry.calculate_contact_probability(bin_mid, bin_size)

    # write the table, restricted to bins inside the requested log range
    with open(outfilename, 'w') as out:
        stats[0].print_header(out)
        for bin_number in bins.range:
            bin_mid = bins.get_bin_mid(bin_number)
            in_range = (bin_mid <= bins.max_logdistance
                        and bin_mid >= bins.min_logdistance)
            if not in_range:
                continue
            stats[bin_number].print_content(
                out, bin_mid, bins.get_bin_range_string(bin_mid))
Ejemplo n.º 20
0
 def test_bgzf_block_count(self):
     # (chrom1, chrom2, expected return value of bgzf_block_count)
     cases = (
         ("chr21", "chr21", 8),
         ("chr21", "chr22", 1),
         ("chr22", "chr22", 12),
         ("chr22", "chr21", 0),
         ("chr21", "chrY", 0),
         ("chr1", "chr2", 0),
         ("1", "2", 0),
     )
     handle = pypairix.open(TEST_FILE_2D_4DN)
     for first, second, expected in cases:
         self.assertEqual(handle.bgzf_block_count(first, second), expected)
Ejemplo n.º 21
0
 def test_exists(self):
     # (block name, expected return value of exists)
     cases = (
         ("chr21|chr21", 1),
         ("chr21|chr22", 1),
         ("chr22|chr22", 1),
         ("chr22|chr21", 0),
         ("chr1|chr2", 0),
         ("chr21", 0),
         ("1|2", 0),
     )
     handle = pypairix.open(TEST_FILE_2D_4DN)
     for blockname, expected in cases:
         self.assertEqual(handle.exists(blockname), expected)
Ejemplo n.º 22
0
    def aggregate(self, chrom1):
        """Collect all records whose first bin lies on ``chrom1``.

        For each bin of ``chrom1`` and each chromosome in
        ``self.gs.idmap[chrom1:]``, the pairix file is queried. When the
        file stores the block in flipped orientation (``chrom2|chrom1``),
        the chrom/start/end fields of every record are swapped so that
        ``chrom1`` ends up on the first side. The records are then typed,
        matched against the bin table on both sides and returned sorted by
        ``bin1_id`` and ``bin2_id``.

        Returns ``None`` when no record was found.
        """
        import pypairix

        handle = pypairix.open(self.filepath, 'r')
        # The looked-up id is not used; the lookup is kept so an unknown
        # chrom1 still fails at this point.
        self.gs.idmap[chrom1]
        chromsizes = self.gs.chromsizes
        row_bins = self.gs.fetch(chrom1)
        remaining_chroms = self.gs.idmap[chrom1:]
        c1, c2, s1, s2, e1, e2 = (
            self.field_numbers[key]
            for key in ('chrom1', 'chrom2', 'start1', 'start2', 'end1', 'end2')
        )

        # read contact matrix one row at a time
        logger.info(chrom1)
        rows = []
        for _, row_bin in row_bins.iterrows():
            for chrom2, _ in six.iteritems(remaining_chroms):
                size2 = chromsizes[chrom2]
                is_flipped = chrom1 != chrom2 and handle.exists2(chrom2, chrom1)
                if not is_flipped:
                    rows.extend(handle.query2D(
                        chrom1, row_bin.start, row_bin.end, chrom2, 0, size2))
                    continue

                # flipped block: query it the way it is stored, then swap
                # the two sides of every record
                for rec in handle.query2D(chrom2, 0, size2, chrom1,
                                          row_bin.start, row_bin.end):
                    rec[c1], rec[c2] = rec[c2], rec[c1]
                    rec[s1], rec[s2] = rec[s2], rec[s1]
                    rec[e1], rec[e2] = rec[e2], rec[e1]
                    rows.append(rec)

        if not rows:
            return None

        table = pandas.DataFrame(rows)
        table = table[self.usecols]
        table.columns = self.columns
        for name, dtype in self.dtypes.items():
            table[name] = table[name].astype(dtype)

        # assign bin IDs from bin table, first side then second side
        first_side = table.merge(
            self.bins,
            left_on=['chrom1', 'start1', 'end1'],
            right_on=['chrom', 'start', 'end'])
        both_sides = first_side.merge(
            self.bins,
            left_on=['chrom2', 'start2', 'end2'],
            right_on=['chrom', 'start', 'end'],
            suffixes=('1', '2'))
        labelled = both_sides.rename(
            columns={'bin1': 'bin1_id', 'bin2': 'bin2_id'})

        return labelled[self.out_columns].sort_values(['bin1_id', 'bin2_id'])
Ejemplo n.º 23
0
class PairixTest2D(unittest.TestCase):
    """2D range queries against TEST_FILE_2D (chromosome '10' x '20')."""

    # Reference records are prepared once, when the class body executes.
    f_type = find_pairs_type(TEST_FILE_2D)
    regions = read_pairs(TEST_FILE_2D, f_type)
    chrom = '10'
    start = 1
    end = 1000000
    chrom2 = '20'
    start2 = 50000000
    end2 = 60000000
    result = get_result_2D(regions, chrom, start, end, chrom2, start2, end2)
    pr = pypairix.open(TEST_FILE_2D)

    def test_query2(self):
        # window passed to query2D as six separate arguments
        it = self.pr.query2D(self.chrom, self.start, self.end, self.chrom2,
                             self.start2, self.end2)
        pr_result = build_it_result(it, self.f_type)
        self.assertEqual(self.result, pr_result)

    def test_querys_2(self):
        # same window, encoded as a single query string
        query = '{}:{}-{}|{}:{}-{}'.format(self.chrom, self.start, self.end,
                                           self.chrom2, self.start2, self.end2)
        it = self.pr.querys2D(query)
        pr_result = build_it_result(it, self.f_type)
        self.assertEqual(self.result, pr_result)

    def test_querys_2_bad_order(self):
        # build the query with coordinates in the wrong order
        query = '{}:{}-{}|{}:{}-{}'.format(self.chrom, self.end, self.start,
                                           self.chrom2, self.start2, self.end2)
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always")
            # trigger a warning
            self.pr.querys2D(query)
            # Use unittest assertions rather than bare ``assert``: bare
            # asserts are stripped under ``python -O`` and give poorer
            # failure messages.
            self.assertEqual(len(w), 1)
            self.assertTrue(issubclass(w[-1].category, pypairix.PairixWarning))

    def test_build_index_with_force_merged_nodups_tab(
            self):  ## recognizing custom set
        # re-index with explicit column numbers instead of a preset
        pypairix.build_index(TEST_FILE_2D,
                             sc=2,
                             bc=3,
                             ec=3,
                             sc2=6,
                             bc2=7,
                             ec2=7,
                             force=1)
        # test with reindex
        pr2 = pypairix.open(TEST_FILE_2D)
        query = '{}:{}-{}|{}:{}-{}'.format(self.chrom, self.start, self.end,
                                           self.chrom2, self.start2, self.end2)
        it2 = pr2.querys2D(query)
        pr2_result = build_it_result(it2, self.f_type)
        self.assertEqual(self.result, pr2_result)
Ejemplo n.º 24
0
def read_pairix(fp,
                region1,
                region2=None,
                chromsizes=None,
                columns=None,
                usecols=None,
                dtypes=None,
                **kwargs):
    """
    Read a pairix-indexed file into DataFrame.

    Parameters
    ----------
    fp : str
        Path handed to ``pypairix.open``.
    region1 : region specifier
        First query region, parsed by ``parse_region``.
    region2 : region specifier, optional
        Second query region. Defaults to ``region1``.
    chromsizes : Series, optional
        Chromosome lengths. When omitted, they are taken from the
        ``#chromsize`` header lines if the file has any.
    columns : list of str, optional
        Column names. When omitted, they are taken from the ``#columns``
        header line if the file has one.
    usecols : list of str, optional
        Subset of ``columns`` to keep, by name.
    dtypes : dict, optional
        Column name -> dtype. Columns not listed are converted to a numeric
        dtype when every value parses, and left untouched otherwise.
    **kwargs
        Accepted for signature compatibility; not used here.

    Returns
    -------
    DataFrame

    Raises
    ------
    ValueError
        If ``usecols`` is given but no column names are available.
    """
    import pypairix
    import cytoolz as toolz

    if dtypes is None:
        dtypes = {}
    f = pypairix.open(fp, "r")

    header = f.get_header()
    if len(header):
        # Group header lines by the text before the first ":".
        header_groups = toolz.groupby(lambda x: x.split(":")[0], header)
        if "#chromsize" in header_groups and chromsizes is None:
            items = [line.split()[1:] for line in header_groups["#chromsize"]]
            if items:
                names, lengths = zip(*((item[0], int(item[1]))
                                       for item in items))
                chromsizes = pd.Series(index=names, data=lengths)
        if "#columns" in header_groups and columns is None:
            columns = header_groups["#columns"][0].split()[1:]

    chrom1, start1, end1 = parse_region(region1, chromsizes)
    if region2 is not None:
        chrom2, start2, end2 = parse_region(region2, chromsizes)
    else:
        chrom2, start2, end2 = chrom1, start1, end1

    it = f.query2D(chrom1, start1, end1, chrom2, start2, end2)
    if usecols is not None:
        if columns is None:
            # Names are needed to translate usecols into field positions;
            # previously this failed with an AttributeError on None.
            raise ValueError(
                "usecols requires column names: pass `columns` or use a "
                "file with a '#columns' header line.")
        argusecols = [columns.index(col) for col in usecols]
        records = [tuple(record[i] for i in argusecols) for record in it]
        columns = usecols
    else:
        records = it

    df = pd.DataFrame.from_records(records, columns=columns)
    if columns is not None:
        for col in columns:
            if col in dtypes:
                df[col] = df[col].astype(dtypes[col])
                continue
            # Equivalent to the deprecated ``errors="ignore"`` mode of
            # pd.to_numeric: a column that does not parse stays as it is.
            try:
                df[col] = pd.to_numeric(df[col])
            except (ValueError, TypeError):
                pass
    return df
Ejemplo n.º 25
0
def read_pairix_block(filepath,
                      block,
                      names=None,
                      dtypes=None,
                      usecols=None,
                      chromsizes=None,
                      chunk_level=0):
    """Build a dask dataframe over one ``(chrom1, chrom2)`` block.

    The length of ``chrom1`` is cut at the edges listed in
    ``LEVEL[chunk_level]``. Each resulting span becomes one task that calls
    ``_fetch_region`` for that slice of ``chrom1``. When ``chromsizes`` is
    not supplied it is read from the chromsize headers of the file.

    Raises
    ------
    ValueError
        If ``chromsizes`` is omitted and the file has no chromsize headers.
    """
    if chromsizes is None:
        handle = pypairix.open(filepath)
        header_sizes = handle.get_chromsize()
        if not len(header_sizes):
            raise ValueError("No chromsize headers found in file. "
                             "They must be provided explicitly.")
        chromsizes = pd.Series(
            {name: int(size) for name, size in header_sizes})
        del handle

    chrom1, chrom2 = block
    length1 = chromsizes[chrom1]

    # Empty frame carrying the expected columns and dtypes, derived from
    # the first rows of the file.
    reader = pd.read_csv(filepath,
                         sep='\t',
                         comment='#',
                         header=None,
                         names=names,
                         dtype=dtypes,
                         usecols=usecols,
                         iterator=True)
    meta = reader.read(1024).iloc[0:0]

    # Make a unique task name
    token = tokenize(filepath, chromsizes, block, names, dtypes, usecols,
                     chunk_level)
    task_name = 'read-pairix-block-' + token

    # Keep the edges below the chromosome length and make sure the length
    # itself closes the last span.
    edges = LEVEL[chunk_level]
    n_below = np.searchsorted(edges, length1)
    edges = edges[:n_below]
    if edges[-1] != length1:
        edges = np.r_[edges, length1]

    # Build the task graph: one task per [lo, hi) span
    graph = {}
    divisions = []
    for i, (lo, hi) in enumerate(zip(edges[:-1], edges[1:])):
        if i == 0:
            divisions.append(lo)
        divisions.append(hi - 1)
        graph[task_name, i] = (_fetch_region, filepath, chromsizes,
                               slice(lo, hi), block, names, usecols, meta)

    # Generate ddf from dask graph
    return dd.DataFrame(graph, task_name, meta, tuple(divisions))
Ejemplo n.º 26
0
class PairixTest2D_LargeChr(unittest.TestCase):
    """2D queries and index rebuilding against TEST_FILE_LARGE_CHR."""

    # Reference records are prepared once, when the class body executes.
    f_type = find_pairs_type(TEST_FILE_LARGE_CHR)
    regions = read_pairs(TEST_FILE_LARGE_CHR, f_type)
    chrom = 'chr21'
    start = 1
    end = 1073741824
    chrom2 = 'chr22'
    start2 = 1
    end2 = 1073741824
    # reverse reversed results to get them in the required order here
    result = get_result_2D(regions, chrom, start, end, chrom2, start2, end2)
    pr = pypairix.open(TEST_FILE_LARGE_CHR)

    def test_query2_4dn(self):
        # window passed to query2D as six separate arguments
        it = self.pr.query2D(self.chrom, self.start, self.end, self.chrom2, self.start2, self.end2)
        pr_result = build_it_result(it, self.f_type)
        self.assertEqual(self.result, pr_result)

    def test_querys_2_4dn(self):
        # same window, encoded as a single query string
        query = '{}:{}-{}|{}:{}-{}'.format(self.chrom, self.start, self.end, self.chrom2, self.start2, self.end2)
        it = self.pr.querys2D(query)
        pr_result = build_it_result(it, self.f_type)
        self.assertEqual(self.result, pr_result)

    def test_build_index_without_force(self):
        # expect an error here... the px2 file already exists
        with self.assertRaises(pypairix.PairixError) as error:
            pypairix.build_index(TEST_FILE_LARGE_CHR)
        # errors are handled differently in python 2 and python 3
        if sys.version_info > (3,0):
            self.assertEqual(error.exception.__str__(), "The index file exists. Please use force=1 to overwrite.")
        else:
            self.assertEqual(error.exception.message, "The index file exists. Please use force=1 to overwrite.")

    def test_build_index_with_region_split_character(self):
        # Re-index with "^" as the region split character and query with a
        # "^"-separated string instead of the usual "|".
        pypairix.build_index(TEST_FILE_LARGE_CHR, region_split_character="^", force=1)
        try:
            pr2 = pypairix.open(TEST_FILE_LARGE_CHR)
            query = '{}:{}-{}^{}:{}-{}'.format(self.chrom, self.start, self.end, self.chrom2, self.start2, self.end2)
            it2 = pr2.querys2D(query)
            pr2_result = build_it_result(it2, self.f_type)
        finally:
            # Always restore the default index, even when the query itself
            # raises, so the other tests do not see the "^" index.
            pypairix.build_index(TEST_FILE_LARGE_CHR, force=1)  # revert
        self.assertEqual(self.result, pr2_result)

    def test_build_index_with_force(self):   ## recognizing file extension pairs.gz
        # rebuild the index with force=1, then query through a new handle
        pypairix.build_index(TEST_FILE_LARGE_CHR, force=1)
        pr2 = pypairix.open(TEST_FILE_LARGE_CHR)
        query = '{}:{}-{}|{}:{}-{}'.format(self.chrom, self.start, self.end, self.chrom2, self.start2, self.end2)
        it2 = pr2.querys2D(query)
        pr2_result = build_it_result(it2, self.f_type)
        self.assertEqual(self.result, pr2_result)
Ejemplo n.º 27
0
def read_pairix(filepath, names, blocks=None, chromsizes=None, **kwargs):
    """
    Read a Pairix-indexed BEDPE-like file as a dask dataframe.

    Parameters
    ----------
    filepath : str
        Path to the pairs or paired-end interval file, not the index file.
        (i.e. omit the .px2 extension).
    names : sequence of str
        Names for the columns in the pairs file.
    blocks : sequence of str or tuple
        List of paired chromosome blocks to load.
        If a list of single chromosome names is given, then all pair
        permutations are loaded. If omitted, the block names stored in
        the index are used.
    chromsizes : dict or Series, optional
        Chromosome lengths to use if chromsizes headers are
        not available.
    **kwargs
        Forwarded unchanged to ``read_pairix_block`` (for example
        ``chunk_level``).

    Returns
    -------
    OrderedDict
        A mapping of chromosome pairs to dask dataframes. Pairs that name
        a chromosome missing from ``chromsizes`` are skipped.

    Raises
    ------
    ValueError
        If ``chromsizes`` is omitted and the file has no chromsize headers.

    """
    handle = pypairix.open(filepath)

    if chromsizes is None:
        header_sizes = handle.get_chromsize()
        if not len(header_sizes):
            raise ValueError(
                "No chromsize headers found in file. "
                "They must be provided explicitly."
            )
        chromsizes = pd.Series(
            {name: int(size) for name, size in header_sizes})

    if blocks is None:
        # fall back to every block the index knows about
        blocks = [blockname.split("|") for blockname in handle.get_blocknames()]
    elif isinstance(blocks[0], str):
        # plain chromosome names: expand to every ordered pair
        chrom_names = blocks
        blocks = []
        for ci in chrom_names:
            for cj in chrom_names:
                blocks.append((ci, cj))

    frames = OrderedDict()
    for chrom1, chrom2 in blocks:
        if chrom1 not in chromsizes or chrom2 not in chromsizes:
            continue
        frames[chrom1, chrom2] = read_pairix_block(
            filepath, (chrom1, chrom2), names, chromsizes=chromsizes, **kwargs
        )
    return frames
Ejemplo n.º 28
0
    def __init__(self,
                 filepath,
                 chromsizes,
                 bins,
                 field_numbers=None,
                 field_dtypes=None):
        """Prepare an aggregator over the pairix-indexed file at ``filepath``.

        ``chromsizes`` and ``bins`` are handed to ``GenomeSegmentation``.
        ``field_numbers`` and ``field_dtypes`` override or extend the class
        defaults ``FIELD_NUMBERS`` and ``FIELD_DTYPES``. Any field without
        a dtype is treated as a float value column and appended to the
        output columns. A warning is issued for every requested contig that
        is absent from the file.

        Raises
        ------
        ImportError
            If ``pypairix`` cannot be imported.
        """
        try:
            import pypairix
        except ImportError:
            raise ImportError(
                "pypairix is required to read pairix-indexed files")

        self._map = map
        self.filepath = filepath
        handle = pypairix.open(self.filepath, 'r')

        # Block names look like "<contig>|<contig>"; collect both sides.
        contigs_in_file = set()
        for blockname in handle.get_blocknames():
            contigs_in_file.update(blockname.split('|'))
        self.file_contigs = contigs_in_file

        # all requested contigs will be placed in the output matrix
        self.gs = GenomeSegmentation(chromsizes, bins)
        bin_table = self.gs.bins.copy()
        bin_table['chrom'] = bin_table['chrom'].astype(object)
        bin_table['bin'] = bin_table.index
        self.bins = bin_table

        # warn about requested contigs not seen in the contact list
        template = "Did not find contig  '{}' in bg2 file."
        for chrom in self.gs.contigs:
            if chrom in self.file_contigs:
                continue
            warnings.warn(template.format(chrom))

        # Assign the column numbers
        self.field_numbers = self.FIELD_NUMBERS.copy()
        if field_numbers is not None:
            self.field_numbers.update(field_numbers)
        self.columns = list(self.field_numbers.keys())
        self.usecols = list(self.field_numbers.values())

        # Assign the column dtypes. Assume additional value fields are float.
        self.dtypes = self.FIELD_DTYPES.copy()
        extra_fields = [
            name for name in self.columns if name not in self.dtypes
        ]
        self.out_columns = ['bin1_id', 'bin2_id', 'count'] + extra_fields
        for name in extra_fields:
            self.dtypes[name] = float

        # Override defaults
        if field_dtypes is not None:
            self.dtypes.update(field_dtypes)
Ejemplo n.º 29
0
class PairixTest_2(unittest.TestCase):
    """2D string query on TEST_FILE_2D where the second region is a bare chromosome."""

    # Reference records are prepared once, when the class body executes.
    f_type = find_pairs_type(TEST_FILE_2D)
    regions = read_pairs(TEST_FILE_2D, f_type)

    chrom = '10'
    start = 25944
    end = 27000000
    chrom2 = '20'

    # The expected records span 0..sys.maxsize on the second chromosome.
    result = get_result_2D(regions, chrom, start, end, chrom2, 0, sys.maxsize)
    pr = pypairix.open(TEST_FILE_2D)

    def test_querys(self):
        # second region carries no coordinates, only the chromosome name
        parts = (self.chrom, self.start, self.end, self.chrom2)
        query = '{}:{}-{}|{}'.format(*parts)
        hits = self.pr.querys2D(query)
        observed = build_it_result(hits, self.f_type)
        self.assertEqual(self.result, observed)
Ejemplo n.º 30
0
    def test_blocknames(self):
        # block names as reported by the index, sorted in place
        handle = pypairix.open(TEST_FILE_2D)
        reported = handle.get_blocknames()
        reported.sort()

        # block names rebuilt from the records: field 0 and field 3 joined
        # by '|', de-duplicated and sorted
        f_type = find_pairs_type(TEST_FILE_2D)
        records = read_pairs(TEST_FILE_2D, f_type)
        expected = sorted({rec[0] + '|' + rec[3] for rec in records})

        self.assertEqual(reported, expected)