Ejemplo n.º 1
0
    def get_dbs(self,
                sort=False,
                orientation=None,
                rm_duplicate=False,
                dbd_tag=False):
        """Build a GenomicRegionSet holding the DNA binding site of every entry.

        Keyword arguments:
        sort -- when true, call sort() on the set before returning it.
        orientation -- when truthy, keep only entries whose ``orient`` equals
                       this value; when falsy, keep every entry.
        rm_duplicate -- when true, call remove_duplicates() on the set before
                        returning it.
        dbd_tag -- when true, name each region with ``entry.rna.str_rna()``
                   instead of the DNA region's own name.

        If ``len(self)`` is 0 the new, empty set is returned right away
        (sort/rm_duplicate are not applied).
        """
        binding_sites = GenomicRegionSet(name="DNA_binding_sites")
        if len(self) == 0:
            return binding_sites

        for entry in self.sequences:
            site = GenomicRegion(
                chrom=entry.dna.chrom,
                initial=entry.dna.initial,
                final=entry.dna.final,
                # The label is the only field that depends on dbd_tag.
                name=entry.rna.str_rna() if dbd_tag else entry.dna.name,
                orientation=entry.dna.orientation,
                data=entry.score,
            )
            # A falsy orientation disables filtering altogether.
            if not orientation or orientation == entry.orient:
                binding_sites.add(site)

        if sort:
            binding_sites.sort()
        if rm_duplicate:
            binding_sites.remove_duplicates()
        return binding_sites
Ejemplo n.º 2
0
 def get_dbs(self, sort=False, orientation=None, rm_duplicate=False):
     """Collect the ``dna`` region of every entry into a GenomicRegionSet.

     Keyword arguments:
     sort -- when true, call sort() on the set before returning it.
     orientation -- when truthy, keep only entries whose ``orient`` equals
                    this value; when falsy, keep every entry.
     rm_duplicate -- when true, call remove_duplicates() on the set before
                     returning it.
     """
     binding_sites = GenomicRegionSet(name="DNA_binding_sites")
     for entry in self.sequences:
         # A falsy orientation disables filtering altogether.
         if not orientation or orientation == entry.orient:
             binding_sites.add(entry.dna)
     if sort:
         binding_sites.sort()
     if rm_duplicate:
         binding_sites.remove_duplicates()
     return binding_sites
Ejemplo n.º 3
0
 def get_dbs(self, sort=False, orientation=None, rm_duplicate=False):
     """Collect the ``dna`` region of every entry into a GenomicRegionSet.

     Keyword arguments:
     sort -- when true, call sort() on the set before returning it.
     orientation -- when truthy, keep only entries whose ``orient`` equals
                    this value; when falsy, keep every entry.
     rm_duplicate -- when true, call remove_duplicates() on the set before
                     returning it.
     """
     binding_sites = GenomicRegionSet(name="DNA_binding_sites")
     for entry in self.sequences:
         # A falsy orientation disables filtering altogether.
         if not orientation or orientation == entry.orient:
             binding_sites.add(entry.dna)
     if sort:
         binding_sites.sort()
     if rm_duplicate:
         binding_sites.remove_duplicates()
     return binding_sites
Ejemplo n.º 4
0
    def get_dbs(self, sort=False, orientation=None, rm_duplicate=False, dbd_tag=False):
        """Build a GenomicRegionSet holding the DNA binding site of every entry.

        Keyword arguments:
        sort -- when true, call sort() on the set before returning it.
        orientation -- when truthy, keep only entries whose ``orient`` equals
                       this value; when falsy, keep every entry.
        rm_duplicate -- when true, call remove_duplicates() on the set before
                        returning it.
        dbd_tag -- when true, name each region with ``entry.rna.str_rna()``
                   instead of the DNA region's own name.

        If ``len(self)`` is 0 the new, empty set is returned right away
        (sort/rm_duplicate are not applied).
        """
        binding_sites = GenomicRegionSet(name="DNA_binding_sites")
        if len(self) == 0:
            return binding_sites

        for entry in self.sequences:
            site = GenomicRegion(
                chrom=entry.dna.chrom,
                initial=entry.dna.initial,
                final=entry.dna.final,
                # The label is the only field that depends on dbd_tag.
                name=entry.rna.str_rna() if dbd_tag else entry.dna.name,
                orientation=entry.dna.orientation,
                data=entry.score,
            )
            # A falsy orientation disables filtering altogether.
            if not orientation or orientation == entry.orient:
                binding_sites.add(site)

        if sort:
            binding_sites.sort()
        if rm_duplicate:
            binding_sites.remove_duplicates()
        return binding_sites
Ejemplo n.º 5
0
class TestGenomicRegionSet(unittest.TestCase):
    
    def region_sets(self, listA, listB):
        """Rebuild self.setA and self.setB for one test scenario.

        listA and listB are sequences of entries indexed as
        [chrom, initial, final]; each entry becomes one GenomicRegion in the
        corresponding set.
        """
        def fill(region_set, entries):
            # Append one GenomicRegion per entry, in the given order.
            for entry in entries:
                region_set.add(GenomicRegion(chrom=entry[0], initial=entry[1], final=entry[2]))

        self.setA = GenomicRegionSet('for Unit Test')
        fill(self.setA, listA)

        self.setB = GenomicRegionSet('for Unit Test')
        fill(self.setB, listB)
    
    def test_extend(self):
        """Check extend() on empty, single-region, multi-region and percentage inputs."""
        # Empty set: there is nothing to extend, so the set stays empty.
        #   A : none
        #   R : none
        self.region_sets([], [])
        self.setA.extend(100, 100)
        self.assertEqual(len(self.setA.sequences), 0)

        # One region widened by 4 on each side.
        #   A :   -----
        #   R : ---------
        self.region_sets([['chr1', 5, 10]], [])
        extended = self.setA
        extended.extend(4, 4)
        self.assertEqual(len(extended), 1)
        self.assertEqual(extended[0].initial, 1)
        self.assertEqual(extended[0].final, 14)

        # Several regions on one chromosome, each widened by 5. The set still
        # holds four regions even where the widened regions now overlap.
        #   A :   -----   ------         -----    -----
        #   R : --------=---------     ------------------
        self.region_sets(
            [['chr1', 5, 10], ['chr1', 15, 20], ['chr1', 40, 50], ['chr1', 65, 75]],
            [])
        extended = self.setA
        extended.extend(5, 5)
        self.assertEqual(len(extended), 4)
        self.assertEqual(extended[0].initial, 0)
        self.assertEqual(extended[0].final, 15)
        self.assertEqual(extended[1].initial, 10)
        self.assertEqual(extended[1].final, 25)
        self.assertEqual(extended[2].initial, 35)
        self.assertEqual(extended[2].final, 55)
        self.assertEqual(extended[3].initial, 60)
        self.assertEqual(extended[3].final, 80)

        # Same coordinates spread over four chromosomes: every region is
        # widened by 5 and keeps its own chromosome.
        self.region_sets(
            [['chr1', 5, 10], ['chr2', 15, 20], ['chr3', 40, 50], ['chr4', 65, 75]],
            [])
        extended = self.setA
        extended.extend(5, 5)
        self.assertEqual(len(extended), 4)
        self.assertEqual(extended[0].initial, 0)
        self.assertEqual(extended[0].final, 15)
        self.assertEqual(extended[0].chrom, 'chr1')
        self.assertEqual(extended[1].initial, 10)
        self.assertEqual(extended[1].final, 25)
        self.assertEqual(extended[1].chrom, 'chr2')
        self.assertEqual(extended[2].initial, 35)
        self.assertEqual(extended[2].final, 55)
        self.assertEqual(extended[2].chrom, 'chr3')
        self.assertEqual(extended[3].initial, 60)
        self.assertEqual(extended[3].final, 80)
        self.assertEqual(extended[3].chrom, 'chr4')

        # Percentage mode: a region of length 100 extended by 10 (percent)
        # on each side is expected to grow by 10 bases per side.
        #   A :   -----
        #   R : ---------
        self.region_sets([['chr1', 100, 200]], [])
        extended = self.setA
        extended.extend(10, 10, percentage=True)
        self.assertEqual(len(extended), 1)
        self.assertEqual(extended[0].initial, 90)
        self.assertEqual(extended[0].final, 210)
        
    def test_sort(self):
        """sort() should order regions on one chromosome by their coordinates."""
        self.region_sets([['chr1',15,20],['chr1',40,50],['chr1',65,75],['chr1',5,10]],
                         [])
        self.setA.sort()
        # Without an explicit check this test would pass as long as sort()
        # does not raise, so pin the resulting order as well.
        ordered = [(self.setA[i].initial, self.setA[i].final)
                   for i in range(len(self.setA))]
        self.assertEqual(ordered, [(5, 10), (15, 20), (40, 50), (65, 75)])
    
    def test_intersect(self):
        """Check intersect() in its default, ORIGINAL and COMP_INCL modes.

        Each scenario rebuilds self.setA / self.setB with region_sets() and
        then verifies the number of returned regions and, where relevant,
        their coordinates. In the diagrams, R1 is the overlap result, R2 the
        ORIGINAL-mode result and R3 the COMP_INCL-mode result.
        """
        def expect_no_overlap():
            # Every mode must return an empty result for the current sets.
            overlap = self.setA.intersect(self.setB)
            self.assertEqual(len(overlap), 0)

            overlap = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
            self.assertEqual(len(overlap), 0)

            overlap = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
            self.assertEqual(len(overlap), 0)

        # Two empty sets.
        #   A : none
        #   B : none
        #   R : none
        self.region_sets([], [])
        expect_no_overlap()

        # Only B is empty.
        #   A :   -----
        #   B : none
        #   R : none
        self.region_sets([['chr1', 5, 10]], [])
        expect_no_overlap()

        # Only A is empty.
        #   A : none
        #   B :   -----
        #   R : none
        self.region_sets([], [['chr1', 5, 10]])
        expect_no_overlap()

        # No overlapping regions.
        #   A : ------      ---------               -------
        #   B :        ----          ------  ------
        #   R : none
        self.region_sets([['chr1', 1, 5], ['chr1', 11, 20], ['chr1', 33, 38]],
                         [['chr1', 7, 9], ['chr1', 20, 25], ['chr1', 26, 31]])
        expect_no_overlap()

        # Regions that touch end-to-end do not count as overlapping.
        #   A : ------      ------
        #   B :       ------
        #   R : none
        self.region_sets([['chr1', 1, 5], ['chr1', 11, 20]],
                         [['chr1', 5, 11]])
        expect_no_overlap()

        # Zero-length regions, including one at the same position in both sets.
        #   A : .      .
        #   B :    .   .
        #   R : none
        self.region_sets([['chr1', 2, 2], ['chr1', 20, 20]],
                         [['chr1', 5, 5], ['chr1', 20, 20]])
        expect_no_overlap()

        # Identical sets (the last two regions overlap each other).
        #   A : ------
        #   B : ------
        #   R : ------
        self.region_sets(
            [['chr1', 1, 10], ['chr1', 500, 550], ['chr1', 600, 650], ['chr1', 700, 750], ['chr1', 725, 800]],
            [['chr1', 1, 10], ['chr1', 500, 550], ['chr1', 600, 650], ['chr1', 700, 750], ['chr1', 725, 800]])
        overlap = self.setA.intersect(self.setB, mode=OverlapType.OVERLAP, rm_duplicates=True)

        self.assertEqual(len(overlap), 4)
        self.assertEqual(overlap[0].initial, 1)
        self.assertEqual(overlap[0].final, 10)

        overlap = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(overlap), 5)
        self.assertEqual(overlap[0].initial, 1)
        self.assertEqual(overlap[0].final, 10)

        overlap = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(overlap), 5)
        self.assertEqual(overlap[0].initial, 1)
        self.assertEqual(overlap[0].final, 10)

        # One overlapping region.
        #   A : ------
        #   B :     --------
        #   R1:     --       (overlap)
        #   R2: ------       (original)
        #   R3:              (comp_incl)
        self.region_sets([['chr1', 1, 10]],
                         [['chr1', 7, 20]])
        overlap = self.setA.intersect(self.setB)
        self.assertEqual(len(overlap), 1)
        self.assertEqual(overlap[0].initial, 7)
        self.assertEqual(overlap[0].final, 10)

        overlap = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(overlap), 1)
        self.assertEqual(overlap[0].initial, 1)
        self.assertEqual(overlap[0].final, 10)

        overlap = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(overlap), 0)

        # Two regions of A overlapping a single region of B.
        #   A : -------      --------
        #   B :     -------------
        #   R1:     ---      ----     (overlap)
        #   R2: -------      -------- (original)
        #   R3:                       (comp_incl)
        self.region_sets([['chr1', 1, 10], ['chr1', 26, 35]],
                         [['chr1', 7, 30]])
        overlap = self.setA.intersect(self.setB)
        self.assertEqual(len(overlap), 2)
        self.assertEqual(overlap[0].initial, 7)
        self.assertEqual(overlap[0].final, 10)
        self.assertEqual(overlap[1].initial, 26)
        self.assertEqual(overlap[1].final, 30)

        overlap = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(overlap), 2)
        self.assertEqual(overlap[0].initial, 1)
        self.assertEqual(overlap[0].final, 10)
        self.assertEqual(overlap[1].initial, 26)
        self.assertEqual(overlap[1].final, 35)

        overlap = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(overlap), 0)

        # Two separately overlapping pairs.
        #   A : -------      --------
        #   B :     -----        --------
        #   R1:     ---          ----     (overlap)
        #   R2: -------      --------     (original)
        #   R3:                           (comp_incl)
        self.region_sets([['chr1', 1, 10], ['chr1', 26, 35]],
                         [['chr1', 7, 15], ['chr1', 30, 40]])
        overlap = self.setA.intersect(self.setB)
        self.assertEqual(len(overlap), 2)
        self.assertEqual(overlap[0].initial, 7)
        self.assertEqual(overlap[0].final, 10)
        self.assertEqual(overlap[1].initial, 30)
        self.assertEqual(overlap[1].final, 35)

        overlap = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(overlap), 2)
        self.assertEqual(overlap[0].initial, 1)
        self.assertEqual(overlap[0].final, 10)
        self.assertEqual(overlap[1].initial, 26)
        self.assertEqual(overlap[1].final, 35)

        overlap = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(overlap), 0)

        # Many mixed overlaps.
        #   A :   ------------------            --------   ---------
        #   B : ----   -------    ------            ----------
        #   R1:   --   -------    --                ----   ---       (overlap)
        #   R2:   ------------------            --------   --------- (original)
        #   R3:                                                      (comp_incl)
        self.region_sets([['chr1', 3, 30], ['chr1', 50, 60], ['chr1', 70, 85]],
                         [['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 27, 35], ['chr1', 55, 75]])

        overlap = self.setA.intersect(self.setB, mode=OverlapType.OVERLAP)
        self.assertEqual(len(overlap), 5)
        self.assertEqual(overlap[0].initial, 3)
        self.assertEqual(overlap[0].final, 5)
        self.assertEqual(overlap[1].initial, 10)
        self.assertEqual(overlap[1].final, 19)
        self.assertEqual(overlap[2].initial, 27)
        self.assertEqual(overlap[2].final, 30)
        self.assertEqual(overlap[3].initial, 55)
        self.assertEqual(overlap[3].final, 60)
        self.assertEqual(overlap[4].initial, 70)
        self.assertEqual(overlap[4].final, 75)

        overlap = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(overlap), 3)
        self.assertEqual(overlap[0].initial, 3)
        self.assertEqual(overlap[0].final, 30)
        self.assertEqual(overlap[1].initial, 50)
        self.assertEqual(overlap[1].final, 60)
        self.assertEqual(overlap[2].initial, 70)
        self.assertEqual(overlap[2].final, 85)

        overlap = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(overlap), 0)

        # Same coordinates on different chromosomes never intersect.
        #   A : chr1  -------
        #   B : chr2  -------
        #   R : none
        self.region_sets([['chr1', 1, 10]],
                         [['chr2', 1, 10]])
        expect_no_overlap()

        # One large region of A covering (parts of) several regions of B.
        #   A : ---------------------------
        #   B : ----    ------       -----------
        #   R1: ----    ------       ------      (overlap)
        #   R2: ---------------------------      (original)
        #   R3:                                  (comp_incl)
        self.region_sets([['chr1', 1, 50]],
                         [['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 45, 60]])
        overlap = self.setA.intersect(self.setB)
        self.assertEqual(len(overlap), 3)
        self.assertEqual(overlap[0].initial, 1)
        self.assertEqual(overlap[0].final, 5)
        self.assertEqual(overlap[1].initial, 10)
        self.assertEqual(overlap[1].final, 19)
        self.assertEqual(overlap[2].initial, 45)
        self.assertEqual(overlap[2].final, 50)

        overlap = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(overlap), 1)
        self.assertEqual(overlap[0].initial, 1)
        self.assertEqual(overlap[0].final, 50)

        overlap = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(overlap), 0)

        # The previous scenario with A and B swapped: only the regions of A
        # lying completely inside B are returned in COMP_INCL mode.
        #   A : ----    ------       -----------
        #   B : ---------------------------
        #   R1: ----    ------       ------      (overlap)
        #   R2: ----    ------       ----------- (original)
        #   R3: ----    ------                   (comp_incl)
        self.region_sets([['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 45, 60]],
                         [['chr1', 1, 50]])
        overlap = self.setA.intersect(self.setB)
        self.assertEqual(len(overlap), 3)
        self.assertEqual(overlap[0].initial, 1)
        self.assertEqual(overlap[0].final, 5)
        self.assertEqual(overlap[1].initial, 10)
        self.assertEqual(overlap[1].final, 19)
        self.assertEqual(overlap[2].initial, 45)
        self.assertEqual(overlap[2].final, 50)

        overlap = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(overlap), 3)
        self.assertEqual(overlap[0].initial, 1)
        self.assertEqual(overlap[0].final, 5)
        self.assertEqual(overlap[1].initial, 10)
        self.assertEqual(overlap[1].final, 19)
        self.assertEqual(overlap[2].initial, 45)
        self.assertEqual(overlap[2].final, 60)

        overlap = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(overlap), 2)
        self.assertEqual(overlap[0].initial, 1)
        self.assertEqual(overlap[0].final, 5)
        self.assertEqual(overlap[1].initial, 10)
        self.assertEqual(overlap[1].final, 19)

        # A contains a region nested inside another one. The default mode is
        # asserted to return two regions here (25-45 and 70-80).
        #   A : --------------         -------
        #           ------
        #   B :       -----          ----------------
        #   R1:       -----            -------      (overlap)
        #   R2: --------------         -------      (original)
        #           ------
        #   R3:                        -------      (comp_incl)
        self.region_sets([['chr1', 1, 50], ['chr1', 20, 40], ['chr1', 70, 80]],
                         [['chr1', 25, 45], ['chr1', 65, 95]])
        overlap = self.setA.intersect(self.setB)
        self.assertEqual(len(overlap), 2)
        self.assertEqual(overlap[0].initial, 25)
        self.assertEqual(overlap[0].final, 45)
        self.assertEqual(overlap[1].initial, 70)
        self.assertEqual(overlap[1].final, 80)

        overlap = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL)
        self.assertEqual(len(overlap), 3)
        self.assertEqual(overlap[1].initial, 20)
        self.assertEqual(overlap[1].final, 40)
        self.assertEqual(overlap[0].initial, 1)
        self.assertEqual(overlap[0].final, 50)
        self.assertEqual(overlap[2].initial, 70)
        self.assertEqual(overlap[2].final, 80)

        overlap = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL)
        self.assertEqual(len(overlap), 1)
        self.assertEqual(overlap[0].initial, 70)
        self.assertEqual(overlap[0].final, 80)

    def test_closest(self):
        """closest() between two empty sets returns an empty result.

        A : none
        B : none
        R : none

        Only this scenario is active; the further scenarios that follow the
        method body are commented out.
        """
        self.region_sets([], [])
        nearest = self.setA.closest(self.setB)
        self.assertEqual(len(nearest), 0)
        # """
        # One empty set
        # A :   -----
        # B : none
        # R : none
        # """
        # self.region_sets([['chr1',5,10]],
        #                  [])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 0)
        # """
        # A : none
        # B :   -----
        # R : none
        # """
        # self.region_sets([],
        #                  [['chr1',5,10]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 0)
        # """
        # Overlapping within set
        # A : -----====-----
        # B :      ----
        # R :      ----
        # """
        # self.region_sets([['chr1',1,10],['chr1',6,15]],
        #                  [['chr1',6,10]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 2)
        # """
        # A :      ----
        # B : -----====-----
        # R : -----====-----
        # """
        # self.region_sets([['chr1',6,10]],
        #                  [['chr1',1,10],['chr1',6,15]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 1)
        # """
        # No overlapping
        # A : ------      ---------               -------
        # B :        ----          ------  ------
        # R :                      ------
        # """
        # self.region_sets([['chr1',1,5],['chr1',11,20],['chr1',33,38]],
        #                  [['chr1',7,9],['chr1',20,25],['chr1',26,31]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 3)
        # # self.assertEqual(result[0].initial, 20)
        # # self.assertEqual(result[0].final, 25)
        # """
        # End-to-end attach
        # A : ------      ------
        # B :       ------
        # R :       ------
        # """
        # self.region_sets([['chr1',1,5],['chr1',11,20]],
        #                  [['chr1',5,11]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 2)
        # # self.assertEqual(result[0].initial, 5)
        # # self.assertEqual(result[0].final, 11)
        # """
        # Perfect overlapping
        # A : ------
        # B : ------
        # R : ------
        # """
        # self.region_sets([['chr1',1,10]],
        #                  [['chr1',1,10]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 1)
        # self.assertEqual(result[0].initial, 1)
        # self.assertEqual(result[0].final, 10)
        # """
        # One overlapping region
        # A : ------
        # B :     --------
        # R :     --------
        # """
        # self.region_sets([['chr1',1,10]],
        #                  [['chr1',7,20]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(result[0].initial, 7)
        # self.assertEqual(result[0].final, 20)
        # """
        # Two simple overlapping regions
        # A : -------      --------
        # B :     -------------
        # R :     -------------
        # """
        # self.region_sets([['chr1',1,10],['chr1',26,35]],
        #                  [['chr1',7,30]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(result[0].initial, 7)
        # self.assertEqual(result[0].final, 30)
        # """
        # Two separately overlapping regions
        # A : -------      --------
        # B :     -----        --------
        # R : none
        # """
        # self.region_sets([['chr1',1,10],['chr1',26,35]],
        #                  [['chr1',7,15],['chr1',30,40]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 2)
        # """
        # Many various overlapping (mixed)
        # A :   ------------------            --------   ---------
        # B : ----   -------    ------            ----------
        # R : none
        # """
        # self.region_sets([['chr1',3,30],['chr1',50,60],['chr1',70,85]],
        #                  [['chr1',1,5],['chr1',10,19],['chr1',27,35],['chr1',55,75]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 4)
        # """
        # Different chromosomes
        # A : chr1  -------
        # B : chr2  -------
        # R : chr2  -------
        #
        # """
        # self.region_sets([['chr1',1,10]],
        #                  [['chr2',1,10]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 0)
        # """
        # Completely included overlapping
        # A : ---------------------------
        # B : ----    ------       -----------
        # R : ----    ------       -----------
        # """
        # self.region_sets([['chr1',1,50]],
        #                  [['chr1',1,5],['chr1',10,19],['chr1',45,60]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 3)
        # """
        # A : ----    ------       -----------
        # B : ---------------------------
        # R : none
        # """
        # self.region_sets([['chr1',1,5],['chr1',10,19],['chr1',45,60]],
        #                  [['chr1',1,50]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(result, False)
        # """
        # A : ----         ------                  ---
        # B :        ---              -----
        # R :        ---
        # """
        # self.region_sets([['chr1',1,5],['chr1',27,45],['chr1',85,95]],
        #                  [['chr1',15,20],['chr1',55,65]])
        # result = self.setA.closest(self.setB)
        # self.assertEqual(len(result), 1)
        # self.assertEqual(result[0].initial, 15)
        # self.assertEqual(result[0].final, 20)
    
    def test_remove_duplicates(self):
        """Check that remove_duplicates() drops only exactly repeated regions."""
        # One exact duplicate is collapsed into a single region.
        #   A : ===== -----
        #   R : ----- -----
        self.region_sets([['chr1', 1, 10], ['chr1', 1, 10], ['chr1', 15, 25]], [])
        self.setA.remove_duplicates()
        deduped = self.setA
        self.assertEqual(len(deduped), 2)
        self.assertEqual(deduped[0].initial, 1)
        self.assertEqual(deduped[0].final, 10)
        self.assertEqual(deduped[1].initial, 15)
        self.assertEqual(deduped[1].final, 25)

        # Regions sharing a start but not an end are not duplicates.
        #   A : =====--- -----
        #   R : =====--- -----
        self.region_sets([['chr1', 1, 10], ['chr1', 1, 15], ['chr1', 20, 25]], [])
        self.setA.remove_duplicates()
        deduped = self.setA
        self.assertEqual(len(deduped), 3)
        self.assertEqual(deduped[0].initial, 1)
        self.assertEqual(deduped[0].final, 10)
        self.assertEqual(deduped[1].initial, 1)
        self.assertEqual(deduped[1].final, 15)
        self.assertEqual(deduped[2].initial, 20)
        self.assertEqual(deduped[2].final, 25)

        # Duplicates at both ends of the set are each collapsed.
        #   A : ===== ----- ------  ====
        #   R : ----- ----- ------  ----
        self.region_sets(
            [['chr1', 1, 10], ['chr1', 1, 10], ['chr1', 15, 25],
             ['chr1', 30, 35], ['chr1', 40, 45], ['chr1', 40, 45]],
            [])
        self.setA.remove_duplicates()
        deduped = self.setA
        self.assertEqual(len(deduped), 4)
        self.assertEqual(deduped[0].initial, 1)
        self.assertEqual(deduped[0].final, 10)
        self.assertEqual(deduped[1].initial, 15)
        self.assertEqual(deduped[1].final, 25)
        self.assertEqual(deduped[2].initial, 30)
        self.assertEqual(deduped[2].final, 35)
        self.assertEqual(deduped[3].initial, 40)
        self.assertEqual(deduped[3].final, 45)

    def test_window(self):
        """Check window() for several adding_length values, including the default."""
        # window = 100: only one base of B falls inside the extended A.
        #   A :             -------
        #   B : ------[ 99 ]       [   199   ]---
        #   R :       -
        self.region_sets([['chr1', 200, 300]],
                         [['chr1', 1, 101], ['chr1', 499, 550]])
        found = self.setA.window(self.setB, adding_length=100)
        self.assertEqual(len(found), 1)
        self.assertEqual(found[0].initial, 100)
        self.assertEqual(found[0].final, 101)

        # window = 200: the left region of B is fully covered, the right one
        # overlaps by a single base.
        #   A :             -------
        #   B : ------[ 99 ]       [   199   ]---
        #   R : ------                        -
        self.region_sets([['chr1', 200, 300]],
                         [['chr1', 1, 101], ['chr1', 499, 550]])
        found = self.setA.window(self.setB, adding_length=200)
        self.assertEqual(len(found), 2)
        self.assertEqual(found[0].initial, 1)  # GenomicRegion.extend will choose 1 rather than 0
        self.assertEqual(found[0].final, 101)
        self.assertEqual(found[1].initial, 499)
        self.assertEqual(found[1].final, 500)

        # Default window (noted as 1000 by the test author).
        #   A :                         ----    ----
        #   B :             --------                    ----
        #   R :                 ----                    ----
        self.region_sets([['chr1', 3000, 3500], ['chr1', 4000, 4500]],
                         [['chr1', 1500, 2500], ['chr1', 5000, 5500]])
        found = self.setA.window(self.setB)
        self.assertEqual(len(found), 2)
        self.assertEqual(found[0].initial, 2000)
        self.assertEqual(found[0].final, 2500)
        self.assertEqual(found[1].initial, 5000)
        self.assertEqual(found[1].final, 5500)

        # window = 2000 returns both regions of B in full; window = 100
        # reaches neither of them.
        #   A :                         ----    ----
        #   B :             --------                    ----
        #   R :             --------                    ----   (window = 2000)
        #   R : none                                           (window = 100)
        self.region_sets([['chr1', 3000, 3500], ['chr1', 4000, 4500]],
                         [['chr1', 1500, 2500], ['chr1', 5000, 5500]])
        found = self.setA.window(self.setB, adding_length=2000)
        self.assertEqual(len(found), 2)
        self.assertEqual(found[0].initial, 1500)
        self.assertEqual(found[0].final, 2500)
        self.assertEqual(found[1].initial, 5000)
        self.assertEqual(found[1].final, 5500)
        found = self.setA.window(self.setB, adding_length=100)
        self.assertEqual(len(found), 0)
        
    def test_subtract(self):
        """Check setA.subtract(setB) against a series of hand-drawn layouts."""

        def run(list_a, list_b):
            # Rebuild both fixtures, then return A minus B.
            self.region_sets(list_a, list_b)
            return self.setA.subtract(self.setB)

        # Each entry: (regions of A, regions of B, expected (initial, final)
        # pairs of the result, in result order).
        exact_cases = [
            # A : none
            # B :    ------
            # R : none
            ([],
             [['chr1', 6, 15]],
             []),
            # A :    ------
            # B : none
            # R :    ------
            ([['chr1', 6, 15]],
             [],
             [(6, 15)]),
            # A : ------
            # B :    ------
            # R : ---
            ([['chr1', 1, 10]],
             [['chr1', 6, 15]],
             [(1, 6)]),
            # A :    ------
            # B : ------
            # R :       ---
            ([['chr1', 6, 15]],
             [['chr1', 1, 10]],
             [(10, 15)]),
            # A :    ---
            # B : ---------
            # R : none
            ([['chr1', 6, 10]],
             [['chr1', 1, 15]],
             []),
            # A : ---------
            # B :    ---
            # R : ---   ---
            ([['chr1', 1, 15]],
             [['chr1', 6, 10]],
             [(1, 6), (10, 15)]),
            # A :    ------
            # B :    ------
            # R : none
            ([['chr1', 6, 15]],
             [['chr1', 6, 15]],
             []),
            # A :   ----------              ------
            # B :          ----------                    ----
            # R :   -------                 ------
            ([['chr1', 5, 30], ['chr1', 70, 85]],
             [['chr1', 20, 50], ['chr1', 100, 110]],
             [(5, 20), (70, 85)]),
            # A :        ------   -----
            # B :    ------
            # R :          ----   -----
            ([['chr1', 20, 30], ['chr1', 35, 55]],
             [['chr1', 10, 23], ['chr1', 100, 110]],
             [(23, 30), (35, 55)]),
            # A :   ch1     ---------------------
            #       ch2     -------------------------
            # B :   ch1             ------
            #       ch2                        ------
            # R :   ch1     --------      -------
            #       ch2     -------------------
            ([['chr1', 0, 30000], ['chr2', 0, 35000]],
             [['chr1', 20000, 23000], ['chr2', 31000, 35000]],
             [(0, 20000), (23000, 30000), (0, 31000)]),
        ]
        for list_a, list_b, spans in exact_cases:
            result = run(list_a, list_b)
            self.assertEqual(len(result), len(spans))
            for idx, (start, end) in enumerate(spans):
                self.assertEqual(result[idx].initial, start)
                self.assertEqual(result[idx].final, end)

        # The B regions shared by the remaining single-chromosome cases.
        def holes(chrom):
            return [[chrom, 10, 15], [chrom, 30, 70],
                    [chrom, 120, 140], [chrom, 200, 240]]

        # A :   -----------------------------------------------------------
        # B :    ---    ---------         ----           ----
        # R :   -   ----         ---------    -----------    --------------
        result = run([['chr1', 5, 1000]], holes('chr1'))
        self.assertEqual(len(result), 5)

        # A :   -----------------------              ------
        #            -----     -----  -----------
        # B :    ---    ---------         ----           ----
        # R :   -   ----         ------              ----
        #            ---         ---  ----    ---
        result = run([['chr1', 5, 100], ['chr1', 20, 40], ['chr1', 60, 80],
                      ['chr1', 95, 150], ['chr1', 180, 220]],
                     holes('chr1'))
        self.assertEqual(len(result), 8)
        self.assertEqual(result[0].initial, 5)

        # Same single-region layout as above, repeated on three chromosomes.
        result = run([['chr1', 5, 1000], ['chr2', 5, 1000], ['chr4', 5, 1000]],
                     holes('chr1') + holes('chr2') + holes('chr4'))
        self.assertEqual(len(result), 15)
        
        
    def test_merge(self):
        """Check that merge() rewrites setA itself into the expected regions."""
        # Each entry: (regions of A, expected (initial, final) pairs after merge).
        scenarios = [
            # A : none
            # R : none
            ([], []),
            # A : -----  -----
            # R : -----  -----
            ([['chr1', 1, 10], ['chr1', 15, 25]],
             [(1, 10), (15, 25)]),
            # A1: ------------   ----
            # A2:    -----
            # R : ------------   ----
            ([['chr1', 1, 30], ['chr1', 11, 20], ['chr1', 40, 50]],
             [(1, 30), (40, 50)]),
            # A1: --------       ----
            # A2:    ---------
            # R : ------------   ----
            ([['chr1', 1, 30], ['chr1', 20, 40], ['chr1', 50, 60]],
             [(1, 40), (50, 60)]),
            # A : =======   (the same region twice)
            # R : -------
            ([['chr1', 1, 30], ['chr1', 1, 30]],
             [(1, 30)]),
        ]
        for list_a, spans in scenarios:
            self.region_sets(list_a, [])
            # merge()'s return value is not used; the set is inspected afterwards.
            self.setA.merge()
            merged = self.setA
            self.assertEqual(len(merged), len(spans))
            for idx, (start, end) in enumerate(spans):
                self.assertEqual(merged[idx].initial, start)
                self.assertEqual(merged[idx].final, end)
        
    def test_cluster(self):
        """Check setA.cluster(distance) for several distances and layouts."""
        # Each entry: (regions of A, [(distance, expected (initial, final)
        # pairs), ...]).  The fixture is built once per entry and cluster()
        # is then called once per distance on that same set.
        exact_cases = [
            # Empty set
            # A : none
            # R : none
            ([],
             [(10, [])]),
            # A :  -------
            # R :  -------
            ([['chr1', 1, 10]],
             [(10, [(1, 10)])]),
            # A :  -----
            #           ------
            # R :  -----------
            ([['chr1', 1, 10], ['chr1', 10, 20]],
             [(10, [(1, 20)])]),
            # A :  -----  -----
            # R1:  -----  -----
            # R2:  ------------
            ([['chr1', 1, 10], ['chr1', 15, 25]],
             [(1, [(1, 10), (15, 25)]),
              (5, [(1, 10), (15, 25)]),
              (6, [(1, 25)])]),
        ]
        for list_a, runs in exact_cases:
            self.region_sets(list_a, [])
            for distance, spans in runs:
                result = self.setA.cluster(distance)
                self.assertEqual(len(result), len(spans))
                for idx, (start, end) in enumerate(spans):
                    self.assertEqual(result[idx].initial, start)
                    self.assertEqual(result[idx].final, end)

        # A :  ---- ----  ----   ----    ----
        # R1:  ---------  ----   ----    ----
        # R2:  ---------------   ----    ----
        # R3:  ----------------------    ----
        # R4:  ------------------------------
        # R5:  ------------------------------
        self.region_sets([['chr1', 1, 10], ['chr1', 15, 25], ['chr1', 35, 45],
                          ['chr1', 60, 70], ['chr1', 90, 100]],
                         [])
        # Only the number of clusters is checked for this layout.
        for distance, count in ((6, 4), (11, 3), (16, 2), (21, 1), (26, 1)):
            result = self.setA.cluster(distance)
            self.assertEqual(len(result), count)
        
    def test_flank(self):
        """Check setA.flank(size) yields one region on each side of every input."""
        # Each entry: (flank size, regions of A, expected (initial, final) pairs).
        scenarios = [
            # A :        -----
            # R1:     ---     ---
            (10,
             [['chr1', 60, 75]],
             [(50, 60), (75, 85)]),
            # A :        -----     ----
            # R1:   -----     =====    ----
            # The gap between the inputs is reported twice: once as the right
            # flank of the first region and once as the left flank of the second.
            (15,
             [['chr1', 60, 75], ['chr1', 90, 100]],
             [(45, 60), (75, 90), (75, 90), (100, 115)]),
        ]
        for size, list_a, spans in scenarios:
            self.region_sets(list_a, [])
            result = self.setA.flank(size)
            self.assertEqual(len(result), len(spans))
            for idx, (start, end) in enumerate(spans):
                self.assertEqual(result[idx].initial, start)
                self.assertEqual(result[idx].final, end)
        
    def test_jaccard(self):
        """Check the Jaccard similarity of two partially overlapping sets.

        self           --8--      ---10---      -4-
        y         ---10---             ---10---
        intersect      -5-             -4-
        similarity:   ( 5 + 4 )/[(8 + 10 + 4) + (10 +10) - (5 + 4 )]
                      = 9/33
        """
        self.region_sets([['chr1',50,58],['chr1',70,80],['chr1',90,94]],
                         [['chr1',45,55],['chr1',76,86]])
        result = self.setA.jaccard(self.setB)
        # The float literal keeps the expected ratio at ~0.2727 even where
        # `/` is integer division (Python 2 without `from __future__ import
        # division`, where 9/33 == 0).  assertAlmostEqual avoids demanding
        # bit-for-bit equality between two independently computed floats.
        self.assertAlmostEqual(result, 9.0 / 33)
    
    def test_get_genome_data(self):
        """Check how many regions get_genome_data() loads for hg19."""
        expectations = (
            ({}, 23),                  # hg19 with default options
            ({"chrom_M": True}, 24),   # hg19, with Mitochondria chromosome
        )
        for extra_options, expected_count in expectations:
            genome = GenomicRegionSet("hg19")
            genome.get_genome_data(organism="hg19", **extra_options)
            self.assertEqual(len(genome), expected_count)
        
    def test_random_regions(self):
        """Smoke-test random_regions() under several option combinations.

        Nothing is asserted about the output: each run only has to complete
        and return something that can be sorted.
        """

        def three_chroms(unit):
            # Fresh fixture lists for every run: chr1/chr2/chrX starting at 0
            # with lengths of 1x, 2x and 3x `unit`.
            return [['chr1', 0, unit], ['chr2', 0, 2 * unit], ['chrX', 0, 3 * unit]]

        # Each entry: (fixture unit length, keyword options for random_regions).
        runs = [
            (10000, dict(total_size=100,
                         overlap_result=False,
                         overlap_input=False)),
            (10000, dict(total_size=100,
                         overlap_result=True,
                         overlap_input=False)),
            (10000, dict(total_size=100,
                         overlap_result=False,
                         overlap_input=True)),
            (10000, dict(total_size=100,
                         overlap_result=True,
                         overlap_input=True)),
            (1000, dict(multiply_factor=100,
                        overlap_result=False,
                        overlap_input=False)),
            (1000, dict(multiply_factor=100,
                        overlap_result=False,
                        overlap_input=False,
                        chrom_M=True)),
        ]
        for unit, options in runs:
            self.region_sets(three_chroms(unit), [])
            result = self.setA.random_regions(organism="mm9", **options)
            result.sort()
Ejemplo n.º 6
0
class TestGenomicRegionSet(unittest.TestCase):
    def region_sets(self, listA, listB):
        """Build self.setA and self.setB from lists of [chrom, initial, final].

        Each set is a new GenomicRegionSet named 'for Unit Test'; setA is
        created and filled before setB is created.
        """
        for attribute, entries in (('setA', listA), ('setB', listB)):
            region_set = GenomicRegionSet('for Unit Test')
            # Bind the attribute before filling the set, so it is already
            # visible on self while regions are being added.
            setattr(self, attribute, region_set)
            for entry in entries:
                region = GenomicRegion(chrom=entry[0],
                                       initial=entry[1],
                                       final=entry[2])
                region_set.add(region)

    def test_extend(self):
        """Check in-place extend(left, right) on empty, single and multi-region sets."""

        def extended(list_a, *args, **kwargs):
            # Build setA, extend it in place and hand the same set back.
            self.region_sets(list_a, [])
            target = self.setA
            target.extend(*args, **kwargs)
            return target

        def expect_spans(regions, spans):
            self.assertEqual(len(regions), len(spans))
            for idx, (start, end) in enumerate(spans):
                self.assertEqual(regions[idx].initial, start)
                self.assertEqual(regions[idx].final, end)

        # Empty set
        # A : none
        # R : none
        self.region_sets([], [])
        self.setA.extend(100, 100)
        self.assertEqual(len(self.setA.sequences), 0)

        # One region
        # A :   -----
        # R : ---------
        result = extended([['chr1', 5, 10]], 4, 4)
        expect_spans(result, [(1, 14)])

        # Many regions on one chromosome; extended neighbours may overlap
        # but are still reported as separate regions.
        # A :   -----   ------         -----    -----
        # R : --------=---------     ------------------
        result = extended([['chr1', 5, 10], ['chr1', 15, 20], ['chr1', 40, 50],
                           ['chr1', 65, 75]], 5, 5)
        expect_spans(result, [(0, 15), (10, 25), (35, 55), (60, 80)])

        # Many regions on different chromosomes: each one is extended and
        # stays on its own chromosome.
        result = extended([['chr1', 5, 10], ['chr2', 15, 20], ['chr3', 40, 50],
                           ['chr4', 65, 75]], 5, 5)
        expected = [(0, 15, 'chr1'), (10, 25, 'chr2'),
                    (35, 55, 'chr3'), (60, 80, 'chr4')]
        self.assertEqual(len(result), len(expected))
        for idx, (start, end, chrom) in enumerate(expected):
            self.assertEqual(result[idx].initial, start)
            self.assertEqual(result[idx].final, end)
            self.assertEqual(result[idx].chrom, chrom)

        # One region, percentage mode: 10 on a 100-long region moves each
        # end by 10.
        # A :   -----
        # R : ---------
        result = extended([['chr1', 100, 200]], 10, 10, percentage=True)
        expect_spans(result, [(90, 210)])

    def test_sort(self):
        """Check that sort() orders same-chromosome regions by coordinate.

        The fixture puts the lowest region (5-10) last, so the assertions
        fail if sort() leaves the insertion order untouched.
        """
        self.region_sets([['chr1', 15, 20], ['chr1', 40, 50], ['chr1', 65, 75],
                          ['chr1', 5, 10]], [])
        # NOTE(review): assumes sort() works in place, as the other tests in
        # this file call it and discard any return value.
        self.setA.sort()
        # Sorting must neither drop nor duplicate regions.
        self.assertEqual(len(self.setA), 4)
        ordered = [(self.setA[idx].initial, self.setA[idx].final)
                   for idx in range(len(self.setA))]
        self.assertEqual(ordered, [(5, 10), (15, 20), (40, 50), (65, 75)])

    def test_intersect(self):
        """Check setA.intersect(setB) in the overlap, original and comp_incl modes.

        Every scenario builds both fixtures once and then calls intersect()
        once per mode.  For each call the result length is asserted, followed
        by the (initial, final) of the leading results listed for it.
        """
        # Keyword arguments of the individual intersect() calls.
        default = {}
        original = {'mode': OverlapType.ORIGINAL}
        comp_incl = {'mode': OverlapType.COMP_INCL}
        explicit_overlap = {'mode': OverlapType.OVERLAP}
        overlap_dedup = {'mode': OverlapType.OVERLAP, 'rm_duplicates': True}

        # Expectation shared by all scenarios where no mode finds anything.
        nothing = [(default, 0, []), (original, 0, []), (comp_incl, 0, [])]

        # Each entry: (regions of A, regions of B,
        #              [(intersect kwargs, expected length, leading spans), ...])
        scenarios = [
            # Two empty sets
            # A : none
            # B : none
            # R : none
            ([], [], nothing),
            # One empty set
            # A :   -----
            # B : none
            # R : none
            ([['chr1', 5, 10]], [], nothing),
            # A : none
            # B :   -----
            # R : none
            ([], [['chr1', 5, 10]], nothing),
            # No overlapping
            # A : ------      ---------               -------
            # B :        ----          ------  ------
            # R : none
            ([['chr1', 1, 5], ['chr1', 11, 20], ['chr1', 33, 38]],
             [['chr1', 7, 9], ['chr1', 20, 25], ['chr1', 26, 31]],
             nothing),
            # End-to-end attach
            # A : ------      ------
            # B :       ------
            # R : none
            ([['chr1', 1, 5], ['chr1', 11, 20]],
             [['chr1', 5, 11]],
             nothing),
            # No length attach
            # A : .      .
            # B :    .   .
            # R : none
            ([['chr1', 2, 2], ['chr1', 20, 20]],
             [['chr1', 5, 5], ['chr1', 20, 20]],
             nothing),
            # Perfect overlapping
            # A : ------
            # B : ------
            # R : ------
            ([['chr1', 1, 10], ['chr1', 500, 550], ['chr1', 600, 650],
              ['chr1', 700, 750], ['chr1', 725, 800]],
             [['chr1', 1, 10], ['chr1', 500, 550], ['chr1', 600, 650],
              ['chr1', 700, 750], ['chr1', 725, 800]],
             [(overlap_dedup, 4, [(1, 10)]),
              (original, 5, [(1, 10)]),
              (comp_incl, 5, [(1, 10)])]),
            # One overlapping region
            # A : ------
            # B :     --------
            # R1:     --       (overlap)
            # R2: ------       (original)
            # R3:              (comp_incl)
            ([['chr1', 1, 10]],
             [['chr1', 7, 20]],
             [(default, 1, [(7, 10)]),
              (original, 1, [(1, 10)]),
              (comp_incl, 0, [])]),
            # Two simple overlapping regions
            # A : -------      --------
            # B :     -------------
            # R1:     ---      ----     (overlap)
            # R2: -------      -------- (original)
            # R3:                       (comp_incl)
            ([['chr1', 1, 10], ['chr1', 26, 35]],
             [['chr1', 7, 30]],
             [(default, 2, [(7, 10), (26, 30)]),
              (original, 2, [(1, 10), (26, 35)]),
              (comp_incl, 0, [])]),
            # Two separately overlapping regions
            # A : -------      --------
            # B :     -----        --------
            # R1:     ---          ----     (overlap)
            # R2: -------      --------     (original)
            # R3:                           (comp_incl)
            ([['chr1', 1, 10], ['chr1', 26, 35]],
             [['chr1', 7, 15], ['chr1', 30, 40]],
             [(default, 2, [(7, 10), (30, 35)]),
              (original, 2, [(1, 10), (26, 35)]),
              (comp_incl, 0, [])]),
            # Many various overlapping (mixed)
            # A :   ------------------            --------   ---------
            # B : ----   -------    ------            ----------
            # R1:   --   -------    --                ----   ---       (overlap)
            # R2:   ------------------            --------   --------- (original)
            # R3:                                                      (comp_incl)
            ([['chr1', 3, 30], ['chr1', 50, 60], ['chr1', 70, 85]],
             [['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 27, 35],
              ['chr1', 55, 75]],
             [(explicit_overlap, 5,
               [(3, 5), (10, 19), (27, 30), (55, 60), (70, 75)]),
              (original, 3, [(3, 30), (50, 60), (70, 85)]),
              (comp_incl, 0, [])]),
            # Different chromosomes
            # A : chr1  -------
            # B : chr2  -------
            # R : none
            ([['chr1', 1, 10]], [['chr2', 1, 10]], nothing),
            # Completely included overlapping
            # A : ---------------------------
            # B : ----    ------       -----------
            # R1: ----    ------       ------      (overlap)
            # R2: ---------------------------      (original)
            # R3:                                  (comp_incl)
            ([['chr1', 1, 50]],
             [['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 45, 60]],
             [(default, 3, [(1, 5), (10, 19), (45, 50)]),
              (original, 1, [(1, 50)]),
              (comp_incl, 0, [])]),
            # A : ----    ------       -----------
            # B : ---------------------------
            # R1: ----    ------       ------      (overlap)
            # R2: ----    ------       ----------- (original)
            # R3: ----    ------                   (comp_incl)
            ([['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 45, 60]],
             [['chr1', 1, 50]],
             [(default, 3, [(1, 5), (10, 19), (45, 50)]),
              (original, 3, [(1, 5), (10, 19), (45, 60)]),
              (comp_incl, 2, [(1, 5), (10, 19)])]),
            # A : --------------         -------
            #         ------
            # B :       -----          ----------------
            # R1:       -----            -------      (overlap)
            #           ----
            # R2: --------------         -------      (original)
            #         ------
            # R3:                        -------      (comp_incl)
            ([['chr1', 1, 50], ['chr1', 20, 40], ['chr1', 70, 80]],
             [['chr1', 25, 45], ['chr1', 65, 95]],
             [(default, 2, [(25, 45), (70, 80)]),
              (original, 3, [(1, 50), (20, 40), (70, 80)]),
              (comp_incl, 1, [(70, 80)])]),
        ]

        for list_a, list_b, calls in scenarios:
            self.region_sets(list_a, list_b)
            for options, count, spans in calls:
                result = self.setA.intersect(self.setB, **options)
                self.assertEqual(len(result), count)
                for idx, (start, end) in enumerate(spans):
                    self.assertEqual(result[idx].initial, start)
                    self.assertEqual(result[idx].final, end)

    def test_closest(self):
        """Check ``closest`` when both region sets are empty.

        Only the empty/empty scenario is active. The other scenarios were
        already disabled in this test; their inputs and the expectations that
        had been drafted for them are summarised at the end of the method so
        they can be revived once the intended behaviour is settled.
        """
        # Two empty sets
        # A : none
        # B : none
        # R : none
        self.region_sets([], [])
        nearest = self.setA.closest(self.setB)
        self.assertEqual(len(nearest), 0)

        # Disabled scenarios (all on chr1 unless stated otherwise), written
        # as  A regions | B regions -> drafted expectation:
        #   one empty set:       [5,10] | none                       -> len 0
        #   one empty set:       none | [5,10]                       -> len 0
        #   overlap within A:    [1,10],[6,15] | [6,10]              -> len 2
        #   overlap within B:    [6,10] | [1,10],[6,15]              -> len 1
        #   no overlapping:      [1,5],[11,20],[33,38] |
        #                        [7,9],[20,25],[26,31]               -> len 3
        #                        (first hit 20-25 was itself commented out)
        #   end-to-end attach:   [1,5],[11,20] | [5,11]              -> len 2
        #                        (first hit 5-11 was itself commented out)
        #   perfect overlap:     [1,10] | [1,10]           -> len 1, hit 1-10
        #   one overlap:         [1,10] | [7,20]           -> first hit 7-20
        #   two simple overlaps: [1,10],[26,35] | [7,30]   -> first hit 7-30
        #   separate overlaps:   [1,10],[26,35] | [7,15],[30,40]     -> len 2
        #   mixed overlaps:      [3,30],[50,60],[70,85] |
        #                        [1,5],[10,19],[27,35],[55,75]       -> len 4
        #   other chromosome:    chr1 [1,10] | chr2 [1,10]           -> len 0
        #   B included in A:     [1,50] | [1,5],[10,19],[45,60]      -> len 3
        #   A included in B:     [1,5],[10,19],[45,60] | [1,50]
        #                                                  -> result == False
        #   scattered:           [1,5],[27,45],[85,95] | [15,20],[55,65]
        #                                                  -> len 1, hit 15-20

    def test_remove_duplicates(self):
        """Exercise ``remove_duplicates`` on set A.

        The test inspects set A itself after the call. Every scenario gives
        the input regions and the ``(initial, final)`` pairs expected to be
        left, in order.
        """
        def check(regions, expected):
            # Rebuild the fixture, de-duplicate set A and compare it region
            # by region against ``expected``.
            self.region_sets(regions, [])
            self.setA.remove_duplicates()
            remaining = self.setA
            self.assertEqual(len(remaining), len(expected))
            for idx, (start, end) in enumerate(expected):
                self.assertEqual(remaining[idx].initial, start)
                self.assertEqual(remaining[idx].final, end)

        # A : ===== -----
        # R : ----- -----
        check([['chr1', 1, 10], ['chr1', 1, 10], ['chr1', 15, 25]],
              [(1, 10), (15, 25)])

        # Regions sharing only their start are expected to be kept.
        # A : =====--- -----
        # R : =====--- -----
        check([['chr1', 1, 10], ['chr1', 1, 15], ['chr1', 20, 25]],
              [(1, 10), (1, 15), (20, 25)])

        # A : ===== ----- ------  ====
        # R : ----- ----- ------  ----
        check([['chr1', 1, 10], ['chr1', 1, 10], ['chr1', 15, 25],
               ['chr1', 30, 35], ['chr1', 40, 45], ['chr1', 40, 45]],
              [(1, 10), (15, 25), (30, 35), (40, 45)])

    def test_window(self):
        """Exercise ``window`` of set A against set B for several lengths.

        Each scenario states the ``(initial, final)`` pairs expected in the
        returned set, in order.
        """
        def expect(hits, expected):
            # Compare a returned region set with the expected coordinates.
            self.assertEqual(len(hits), len(expected))
            for idx, (start, end) in enumerate(expected):
                self.assertEqual(hits[idx].initial, start)
                self.assertEqual(hits[idx].final, end)

        # A :             -------
        # B : ------[ 99 ]       [   199   ]---
        # window = 100
        # R :       -                           only one base overlaps with extending A
        self.region_sets([['chr1', 200, 300]],
                         [['chr1', 1, 101], ['chr1', 499, 550]])
        expect(self.setA.window(self.setB, adding_length=100), [(100, 101)])

        # A :             -------
        # B : ------[ 99 ]       [   199   ]---
        # window = 200
        # R : ------                        -
        # left-hand side is covered, and the right-hand side is only one base
        # overlapped
        self.region_sets([['chr1', 200, 300]],
                         [['chr1', 1, 101], ['chr1', 499, 550]])
        # First start is 1: GenomicRegion.extend will choose 1 rather than 0.
        expect(self.setA.window(self.setB, adding_length=200),
               [(1, 101), (499, 500)])

        # A :                         ----    ----
        # B :             --------                    ----
        # window = 1000 (default)
        # R :                 ----                    ----
        self.region_sets([['chr1', 3000, 3500], ['chr1', 4000, 4500]],
                         [['chr1', 1500, 2500], ['chr1', 5000, 5500]])
        expect(self.setA.window(self.setB), [(2000, 2500), (5000, 5500)])

        # A :                         ----    ----
        # B :             --------                    ----
        # window = 2000
        # R :             --------                    ----
        #                     ----                    ----
        # window = 100
        # R : none
        self.region_sets([['chr1', 3000, 3500], ['chr1', 4000, 4500]],
                         [['chr1', 1500, 2500], ['chr1', 5000, 5500]])
        expect(self.setA.window(self.setB, adding_length=2000),
               [(1500, 2500), (5000, 5500)])
        # Same fixture, narrower window: nothing is expected to be in reach.
        expect(self.setA.window(self.setB, adding_length=100), [])

    def test_subtract(self):
        """Exercise ``subtract`` (set A minus set B) over many layouts.

        Each scenario gives both inputs, the expected number of remaining
        regions and, where the original test pinned them, the expected
        ``(initial, final)`` pairs in order.
        """
        def check(regions_a, regions_b, count, bounds=()):
            # Build both sets, subtract B from A, verify the region count and
            # then the leading coordinates listed in ``bounds``.
            self.region_sets(regions_a, regions_b)
            remainder = self.setA.subtract(self.setB)
            self.assertEqual(len(remainder), count)
            for idx, (start, end) in enumerate(bounds):
                self.assertEqual(remainder[idx].initial, start)
                self.assertEqual(remainder[idx].final, end)
            return remainder

        # A : none
        # B :    ------
        # R : none
        check([], [['chr1', 6, 15]], 0)

        # A :    ------
        # B : none
        # R :    ------
        check([['chr1', 6, 15]], [], 1, [(6, 15)])

        # A : ------
        # B :    ------
        # R : ---
        check([['chr1', 1, 10]], [['chr1', 6, 15]], 1, [(1, 6)])

        # A :    ------
        # B : ------
        # R :       ---
        check([['chr1', 6, 15]], [['chr1', 1, 10]], 1, [(10, 15)])

        # A :    ---
        # B : ---------
        # R : none
        check([['chr1', 6, 10]], [['chr1', 1, 15]], 0)

        # A : ---------
        # B :    ---
        # R : ---   ---
        check([['chr1', 1, 15]], [['chr1', 6, 10]], 2, [(1, 6), (10, 15)])

        # A :    ------
        # B :    ------
        # R : none
        check([['chr1', 6, 15]], [['chr1', 6, 15]], 0)

        # A :   ----------              ------
        # B :          ----------                    ----
        # R :   -------                 ------
        check([['chr1', 5, 30], ['chr1', 70, 85]],
              [['chr1', 20, 50], ['chr1', 100, 110]],
              2, [(5, 20), (70, 85)])

        # A :        ------   -----
        # B :    ------
        # R :          ----   -----
        check([['chr1', 20, 30], ['chr1', 35, 55]],
              [['chr1', 10, 23], ['chr1', 100, 110]],
              2, [(23, 30), (35, 55)])

        # A :   ch1     ---------------------
        #       ch2     -------------------------
        # B :   ch1             ------
        #       ch2                        ------
        # R :   ch1     --------      -------
        #       ch2     -------------------
        check([['chr1', 0, 30000], ['chr2', 0, 35000]],
              [['chr1', 20000, 23000], ['chr2', 31000, 35000]],
              3, [(0, 20000), (23000, 30000), (0, 31000)])

        # A :   -----------------------------------------------------------
        # B :    ---    ---------         ----           ----
        # R :   -   ----         ---------    -----------    --------------
        check([['chr1', 5, 1000]],
              [['chr1', 10, 15], ['chr1', 30, 70],
               ['chr1', 120, 140], ['chr1', 200, 240]],
              5)

        # A :   -----------------------              ------
        #            -----     -----  -----------
        # B :    ---    ---------         ----           ----
        # R :   -   ----         ------              ----
        #            ---         ---  ----    ---
        remainder = check([['chr1', 5, 100], ['chr1', 20, 40],
                           ['chr1', 60, 80], ['chr1', 95, 150],
                           ['chr1', 180, 220]],
                          [['chr1', 10, 15], ['chr1', 30, 70],
                           ['chr1', 120, 140], ['chr1', 200, 240]],
                          8)
        # Only the start of the first remaining piece is pinned here.
        self.assertEqual(remainder[0].initial, 5)

        # Same layout repeated on three chromosomes.
        # A :   -----------------------------------------------------------
        # B :    ---    ---------         ----           ----
        # R :   -   ----         ---------    -----------    --------------
        check([['chr1', 5, 1000], ['chr2', 5, 1000], ['chr4', 5, 1000]],
              [['chr1', 10, 15], ['chr1', 30, 70], ['chr1', 120, 140],
               ['chr1', 200, 240], ['chr2', 10, 15], ['chr2', 30, 70],
               ['chr2', 120, 140], ['chr2', 200, 240], ['chr4', 10, 15],
               ['chr4', 30, 70], ['chr4', 120, 140], ['chr4', 200, 240]],
              15)

    def test_merge(self):
        """Exercise ``merge`` on set A.

        The test inspects set A itself after the call. Each scenario gives
        the input regions and the ``(initial, final)`` pairs expected
        afterwards, in order.
        """
        def check(regions, expected):
            # Rebuild the fixture, merge set A and compare it region by
            # region against ``expected``.
            self.region_sets(regions, [])
            self.setA.merge()
            merged = self.setA
            self.assertEqual(len(merged), len(expected))
            for idx, (start, end) in enumerate(expected):
                self.assertEqual(merged[idx].initial, start)
                self.assertEqual(merged[idx].final, end)

        # A : none
        # R : none
        check([], [])

        # A : -----  -----
        # R : -----  -----
        check([['chr1', 1, 10], ['chr1', 15, 25]],
              [(1, 10), (15, 25)])

        # A1: ------------   ----
        # A2:    -----
        # R : ------------   ----
        check([['chr1', 1, 30], ['chr1', 11, 20], ['chr1', 40, 50]],
              [(1, 30), (40, 50)])

        # A1: --------       ----
        # A2:    ---------
        # R : ------------   ----
        check([['chr1', 1, 30], ['chr1', 20, 40], ['chr1', 50, 60]],
              [(1, 40), (50, 60)])

        # A : =======
        # R : -------
        check([['chr1', 1, 30], ['chr1', 1, 30]],
              [(1, 30)])

    def test_cluster(self):
        """Exercise ``cluster`` on set A with a range of distance arguments.

        Each check states the expected number of returned regions and, where
        the original test pinned them, their ``(initial, final)`` pairs.
        """
        def expect(distance, count, bounds=()):
            # Cluster the current set A with ``distance`` and verify the
            # number of regions plus any listed coordinates.
            clustered = self.setA.cluster(distance)
            self.assertEqual(len(clustered), count)
            for idx, (start, end) in enumerate(bounds):
                self.assertEqual(clustered[idx].initial, start)
                self.assertEqual(clustered[idx].final, end)

        # Empty sets
        # A : none
        # R : none
        self.region_sets([], [])
        expect(10, 0)

        # A :  -------
        # R :  -------
        self.region_sets([['chr1', 1, 10]], [])
        expect(10, 1, [(1, 10)])

        # A :  -----
        #           ------
        # R :  -----------
        self.region_sets([['chr1', 1, 10], ['chr1', 10, 20]], [])
        expect(10, 1, [(1, 20)])

        # A :  -----  -----
        # R1:  -----  -----
        # R2:  ------------
        self.region_sets([['chr1', 1, 10], ['chr1', 15, 25]], [])
        expect(1, 2, [(1, 10), (15, 25)])
        expect(5, 2, [(1, 10), (15, 25)])
        expect(6, 1, [(1, 25)])

        # A :  ---- ----  ----   ----    ----
        # R1:  ---------  ----   ----    ----
        # R2:  ---------------   ----    ----
        # R3:  ----------------------    ----
        # R4:  ------------------------------
        # R5:  ------------------------------
        self.region_sets([['chr1', 1, 10], ['chr1', 15, 25], ['chr1', 35, 45],
                          ['chr1', 60, 70], ['chr1', 90, 100]], [])
        for distance, count in ((6, 4), (11, 3), (16, 2), (21, 1), (26, 1)):
            expect(distance, count)

    def test_flank(self):
        """Exercise ``flank`` on set A.

        Each scenario gives the input regions, the flank size and the
        ``(initial, final)`` pairs expected in the returned set, in order.
        """
        def check(regions, size, expected):
            # Rebuild the fixture, compute the flanks of set A and compare
            # them one by one against ``expected``.
            self.region_sets(regions, [])
            flanks = self.setA.flank(size)
            self.assertEqual(len(flanks), len(expected))
            for idx, (start, end) in enumerate(expected):
                self.assertEqual(flanks[idx].initial, start)
                self.assertEqual(flanks[idx].final, end)

        # A :        -----
        # R1:     ---     ---
        check([['chr1', 60, 75]], 10,
              [(50, 60), (75, 85)])

        # The gap between the two regions is expected twice (marked =====).
        # A :        -----     ----
        # R1:   -----     =====    ----
        check([['chr1', 60, 75], ['chr1', 90, 100]], 15,
              [(45, 60), (75, 90), (75, 90), (100, 115)])

    def test_jaccard(self):
        """Check the Jaccard similarity of set A against set B.

        self           --8--      ---10---      -4-
        y         ---10---             ---10---
        intersect      -5-             -4-
        similarity:   ( 5 + 4 )/[(8 + 10 + 4) + (10 +10) - (5 + 4 )]
                      = 9/33
        """
        self.region_sets(
            [['chr1', 50, 58], ['chr1', 70, 80], ['chr1', 90, 94]],
            [['chr1', 45, 55], ['chr1', 76, 86]])
        result = self.setA.jaccard(self.setB)
        # The expected value is a non-terminating fraction, so compare with a
        # tolerance instead of exact float equality. The float literal keeps
        # the expectation at 0.2727... even where ``/`` is integer division.
        self.assertAlmostEqual(result, 9.0 / 33)

    def test_get_genome_data(self):
        """Check how many regions ``get_genome_data`` loads for hg19.

        The test expects 23 regions by default and 24 when ``chrom_M=True``
        is passed (i.e. with the mitochondrial chromosome).
        """
        for extra_args, expected_count in (({}, 23), ({"chrom_M": True}, 24)):
            genome = GenomicRegionSet("hg19")
            genome.get_genome_data(organism="hg19", **extra_args)
            self.assertEqual(len(genome), expected_count)

    def test_random_regions(self):
        """Smoke-test ``random_regions`` with several option combinations.

        Nothing is asserted about the generated regions; the test only
        requires that every call, and sorting its result, completes without
        raising.
        """
        # (region scale, keyword arguments for random_regions)
        scenarios = (
            (10000, dict(total_size=100,
                         overlap_result=False, overlap_input=False)),
            (10000, dict(total_size=100,
                         overlap_result=True, overlap_input=False)),
            (10000, dict(total_size=100,
                         overlap_result=False, overlap_input=True)),
            (10000, dict(total_size=100,
                         overlap_result=True, overlap_input=True)),
            (1000, dict(multiply_factor=100,
                        overlap_result=False, overlap_input=False)),
            (1000, dict(multiply_factor=100,
                        overlap_result=False, overlap_input=False,
                        chrom_M=True)),
        )
        for scale, options in scenarios:
            # Fresh input lists for every scenario, as in the original test.
            self.region_sets(
                [['chr1', 0, scale], ['chr2', 0, 2 * scale],
                 ['chrX', 0, 3 * scale]], [])
            randomized = self.setA.random_regions(organism="mm9", **options)
            randomized.sort()
            # To inspect a scenario by hand, print each region of
            # ``randomized.sequences`` (chrom, initial, final, length) and
            # ``randomized.within_overlap()``.
Ejemplo n.º 7
0
def _has_block_fields(regionset):
    """Return True when the first region's data holds the 7 tab-separated
    fields that the block (exon) branch expects."""
    try:
        fields = regionset.sequences[0].data.split("\t")
    except (IndexError, AttributeError):
        # Empty set, or a first region without a data string.
        return False
    return len(fields) == 7


def _region_tag(chrom, start, end, orientation):
    """Build ``REGION_<chrom>_<start>_<end>[_<orientation>]``."""
    fields = ["REGION", chrom, str(start), str(end)]
    try:
        return "_".join(fields + [orientation])
    except TypeError:
        # Orientation is not a string (e.g. None): leave it out of the tag.
        return "_".join(fields)


def _write_block_transcript(gr, genome, directory):
    """Write ``<name>.fa`` with one FASTA record per BED block of *gr*."""
    with open(os.path.join(directory, gr.name + ".fa"), "w") as fasta:
        data = gr.data.split("\t")
        if len(data) != 7:
            print("Warning: The given regions have no block information, please try write_bed_blocks")
            return

        n = int(data[4])
        blocks = [int(b) for b in data[5].split(",") if b]
        starts = [int(s) for s in data[6].split(",") if s]
        minus_strand = gr.orientation == "-"

        records = []
        for i in range(n):
            start = gr.initial + starts[i]
            end = start + blocks[i]
            # Exons are numbered in transcription order, so the numbering
            # runs backwards along the genome on the minus strand.
            ex = "exon:" + str(n - i if minus_strand else i + 1)
            header = ">" + " ".join(
                [gr.name, ex, _region_tag(gr.chrom, start, end, gr.orientation)])
            # NOTE(review): this branch fetches [start-1, end-1) while the
            # branch without block information fetches [initial, final);
            # kept as in the original -- confirm the intended convention.
            seq = genome.fetch(gr.chrom, start - 1, end - 1)
            if minus_strand:
                seq = Seq(seq, IUPAC.unambiguous_dna).reverse_complement()
            records.append((header, seq))

        if minus_strand:
            # Emit the first transcribed exon first.
            records.reverse()
        for header, seq in records:
            print(header, file=fasta)
            print(seq, file=fasta)


def _write_region_group(name, regions, genome, directory):
    """Write ``<name>.fa`` with one FASTA record per region in *regions*."""
    with open(os.path.join(directory, name + ".fa"), "w") as fasta:
        for g in regions:
            # Tag each record with its own orientation.
            regiontag = _region_tag(g.chrom, g.initial, g.final, g.orientation)
            print(">" + " ".join([g.name, regiontag]), file=fasta)
            print(genome.fetch(g.chrom, g.initial, g.final), file=fasta)


def load_exon_sequence(bed, directory, genome_path):
    """Load the exon sequences of the transcripts in a BED file.

    Input BED format should contain:
        blockCount - The number of blocks (exons) in the BED line.
        blockSizes - A comma-separated list of the block sizes.
        blockStarts - A comma-separated list of block starts.
        see details: http://genome.ucsc.edu/FAQ/FAQformat#format1

    When the first region carries no such block information, consecutive
    regions sharing a name (after sorting by name) are written together to
    one file instead; unnamed regions are named after ``toString()``.

    Args:
        bed: Path of the BED file, handed to ``GenomicRegionSet.read``.
        directory: Output directory; created when it does not exist.
        genome_path: Path of the genome FASTA opened with ``pysam``.

    Output:
        Each FASTA file represents a transcript and contains all of its
        exons. Nothing is returned.

    Raises:
        SystemExit: In the block branch, when a region has no name.
    """
    regionset = GenomicRegionSet("bed")
    regionset.read(bed)
    regionset.sort()

    # Both branches write into ``directory``, so make sure it exists once.
    if not os.path.exists(directory):
        os.makedirs(directory)

    genome = pysam.Fastafile(genome_path)
    try:
        if _has_block_fields(regionset):
            for gr in regionset:
                if not gr.name:
                    print("Error: For fetching exon sequences, please define the transcript name.")
                    sys.exit()
                _write_block_transcript(gr, genome, directory)
            return

        # No block information. Name the unnamed regions before sorting so
        # the sort never has to compare missing names.
        for gr in regionset:
            if not gr.name:
                gr.name = gr.toString()
        regionset.sequences.sort(key=lambda g: g.name)

        # Flush one file per run of identically named regions.
        pre_id = None
        group = []
        for gr in regionset:
            if group and gr.name != pre_id:
                _write_region_group(pre_id, group, genome, directory)
                group = []
            pre_id = gr.name
            group.append(gr)
        if group:
            # Last transcript; skipped entirely for an empty region set.
            _write_region_group(pre_id, group, genome, directory)
    finally:
        genome.close()
Ejemplo n.º 8
0
def _region_tag(region):
    """Return the "REGION_<chrom>_<start>_<end>[_<strand>]" tag for a FASTA header."""
    fields = ["REGION", region.chrom, str(region.initial), str(region.final)]
    try:
        return "_".join(fields + [region.orientation])
    except (TypeError, AttributeError):
        # No usable strand (e.g. orientation is None): leave it out of the tag.
        return "_".join(fields)


def _write_transcript_regions(directory, name, regions, genome):
    """Write all regions sharing one transcript name to <directory>/<name>.fa."""
    with open(os.path.join(directory, name + ".fa"), "w") as f:
        for g in regions:
            print(">" + " ".join([g.name, _region_tag(g)]), file=f)
            print(genome.fetch(g.chrom, g.initial, g.final), file=f)


def _write_exon_blocks(f, gr, data, genome):
    """Write one FASTA record per BED block (exon) of `gr` to the open file `f`."""
    n = int(data[4])
    # Trailing commas in the BED block lists produce empty items; drop them.
    blocks = [int(b) for b in data[5].split(",") if b]
    starts = [int(s) for s in data[6].split(",") if s]
    minus = gr.orientation == "-"

    records = []
    for i in range(n):
        start = gr.initial + starts[i]
        end = start + blocks[i]
        # Exons are numbered along the transcript, so count backwards on "-".
        ex = "exon:" + str(n - i if minus else i + 1)
        header = ">" + " ".join([gr.name, ex,
                                 "_".join(["REGION", gr.chrom,
                                           str(start), str(end),
                                           gr.orientation])])
        # NOTE(review): the sequence is fetched from [start-1, end-1) while the
        # header reports start/end -- confirm the coordinate convention used by
        # read_bed before changing this offset.
        seq = genome.fetch(gr.chrom, start - 1, end - 1)
        if minus:
            seq = Seq(seq, IUPAC.unambiguous_dna).reverse_complement()
        records.append((header, seq))

    # Emit exons in transcript order (exon:1 first) for minus-strand genes.
    if minus:
        records.reverse()
    for header, seq in records:
        print(header, file=f)
        print(seq, file=f)


def load_exon_sequence(bed, directory, genome_path):
    """Load the exon sequences of the transcripts and write them as FASTA.

    Input BED format should contain:
        blockCount - The number of blocks (exons) in the BED line.
        blockSizes - A comma-separated list of the block sizes.
        blockStarts - A comma-separated list of block starts.
        see details: http://genome.ucsc.edu/FAQ/FAQformat#format1

    If the first region carries no block information (its data field does not
    split into 7 tab-separated columns), regions are instead grouped by name
    and every group is written to one file, one record per region.

    Args:
        bed: Path of the BED file to read.
        directory: Output directory; created if it does not exist.
        genome_path: Path of the genome FASTA file opened with pysam.

    Output:
        Each FASTA file (<directory>/<name>.fa) represents a transcript and
        contains all the exons within the file.

    Exits the program (sys.exit) if block information is present but a
    region has no name.
    """
    regionset = GenomicRegionSet("bed")
    regionset.read_bed(bed)
    regionset.sort()

    genome = pysam.Fastafile(genome_path)

    # Always bind the flag: an empty set (IndexError) or a region without a
    # data string (AttributeError) simply means "no block information".
    try:
        blockinfor = len(regionset.sequences[0].data.split("\t")) == 7
    except (IndexError, AttributeError):
        blockinfor = False

    if blockinfor:
        for gr in regionset:
            if not gr.name:
                print("Error: For fetching exon sequences, please define the transcript name.")
                sys.exit()
            if not os.path.exists(directory):
                os.makedirs(directory)
            with open(os.path.join(directory, gr.name + ".fa"), "w") as f:
                data = gr.data.split("\t")
                if len(data) == 7:
                    _write_exon_blocks(f, gr, data, genome)
                else:
                    print("Warning: The given regions have no block information, please try write_bed_blocks")
    else:
        if not regionset.sequences:
            # Nothing to write for an empty BED file.
            return

        # Give unnamed regions their fallback name before sorting so the sort
        # key is always a string and same-named regions end up adjacent.
        for gr in regionset:
            if not gr.name:
                gr.name = gr.toString()
        regionset.sequences.sort(key=lambda g: g.name)

        if not os.path.exists(directory):
            os.makedirs(directory)

        group_name = None
        group = []
        for gr in regionset:
            if group and gr.name != group_name:
                _write_transcript_regions(directory, group_name, group, genome)
                group = []
            group_name = gr.name
            group.append(gr)
        # Last transcript
        if group:
            _write_transcript_regions(directory, group_name, group, genome)