def get_dbs(self, sort=False, orientation=None, rm_duplicate=False, dbd_tag=False): """Return GenomicRegionSet which contains all DNA binding sites""" dna_set = GenomicRegionSet(name="DNA_binding_sites") if len(self) == 0: return dna_set for rd in self.sequences: if dbd_tag: dbs = GenomicRegion(chrom=rd.dna.chrom, initial=rd.dna.initial, final=rd.dna.final, name=rd.rna.str_rna(), orientation=rd.dna.orientation, data=rd.score) else: dbs = GenomicRegion(chrom=rd.dna.chrom, initial=rd.dna.initial, final=rd.dna.final, name=rd.dna.name, orientation=rd.dna.orientation, data=rd.score) if not orientation: dna_set.add(dbs) else: if orientation == rd.orient: dna_set.add(dbs) else: pass if sort: dna_set.sort() if rm_duplicate: dna_set.remove_duplicates() return dna_set
def get_dbs(self, sort=False, orientation=None, rm_duplicate=False): """Return GenomicRegionSet which contains all DNA binding sites""" dna_set = GenomicRegionSet(name="DNA_binding_sites") for rd in self.sequences: if not orientation: dna_set.add(rd.dna) else: if orientation == rd.orient: dna_set.add(rd.dna) else: pass if sort: dna_set.sort() if rm_duplicate: dna_set.remove_duplicates() return dna_set
class TestGenomicRegionSet(unittest.TestCase): def region_sets(self,listA,listB): """ Setting two GenomicRegionSets as self.setA and self.setB for each case test. """ self.setA = GenomicRegionSet('for Unit Test') for i in range(len(listA)): self.setA.add(GenomicRegion(chrom=listA[i][0], initial=listA[i][1], final=listA[i][2])) self.setB = GenomicRegionSet('for Unit Test') for i in range(len(listB)): self.setB.add(GenomicRegion(chrom=listB[i][0], initial=listB[i][1], final=listB[i][2])) def test_extend(self): """ Two empty sets A : none R : none """ self.region_sets([], []) self.setA.extend(100,100) self.assertEqual(len(self.setA.sequences), 0) """ One region A : ----- R : --------- """ self.region_sets([['chr1',5,10]], []) result = self.setA result.extend(4,4) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 14) """ Many region A : ----- ------ ----- ----- R : --------=--------- ------------------ """ self.region_sets([['chr1',5,10],['chr1',15,20],['chr1',40,50],['chr1',65,75]], []) result = self.setA result.extend(5,5) self.assertEqual(len(result), 4) self.assertEqual(result[0].initial, 0) self.assertEqual(result[0].final, 15) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 25) self.assertEqual(result[2].initial, 35) self.assertEqual(result[2].final, 55) self.assertEqual(result[3].initial, 60) self.assertEqual(result[3].final, 80) """ Many region in different chromosome A : ----- ------ ----- ----- R : none """ self.region_sets([['chr1',5,10],['chr2',15,20],['chr3',40,50],['chr4',65,75]], []) result = self.setA result.extend(5,5) self.assertEqual(len(result), 4) self.assertEqual(result[0].initial, 0) self.assertEqual(result[0].final, 15) self.assertEqual(result[0].chrom, 'chr1') self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 25) self.assertEqual(result[1].chrom, 'chr2') self.assertEqual(result[2].initial, 35) self.assertEqual(result[2].final, 55) self.assertEqual(result[2].chrom, 'chr3') self.assertEqual(result[3].initial, 60) self.assertEqual(result[3].final, 80) self.assertEqual(result[3].chrom, 'chr4') """ One region A : ----- R : --------- """ self.region_sets([['chr1',100,200]], []) result = self.setA result.extend(10,10,percentage=True) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 90) self.assertEqual(result[0].final, 210) def test_sort(self): self.region_sets([['chr1',15,20],['chr1',40,50],['chr1',65,75],['chr1',5,10]], []) self.setA.sort() def test_intersect(self): """ Two empty sets A : none B : none R : none """ self.region_sets([], []) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ One empty set A : ----- B : none R : none """ self.region_sets([['chr1',5,10]], []) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ A : none B : ----- R : none """ self.region_sets([], [['chr1',5,10]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ No overlapping A : ------ --------- ------- B : ---- ------ ------ R : none """ self.region_sets([['chr1',1,5],['chr1',11,20],['chr1',33,38]], [['chr1',7,9],['chr1',20,25],['chr1',26,31]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ End-to-end attach A : ------ ------ B : ------ R : none """ self.region_sets([['chr1',1,5],['chr1',11,20]], [['chr1',5,11]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ No length attach A : . . B : . . R : none """ self.region_sets([['chr1',2,2],['chr1',20,20]], [['chr1',5,5],['chr1',20,20]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Perfect overlapping A : ------ B : ------ R : ------ """ self.region_sets([['chr1',1,10],['chr1',500,550],['chr1',600,650],['chr1',700,750],['chr1',725,800]], [['chr1',1,10],['chr1',500,550],['chr1',600,650],['chr1',700,750],['chr1',725,800]]) result = self.setA.intersect(self.setB, mode=OverlapType.OVERLAP, rm_duplicates=True) self.assertEqual(len(result), 4) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 5) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 5) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) """ One overlapping region A : ------ B : -------- R1: -- (overlap) R2: ------ (original) R3: (comp_incl) """ self.region_sets([['chr1',1,10]], [['chr1',7,20]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 7) self.assertEqual(result[0].final, 10) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Two simple overlapping regions A : ------- -------- B : ------------- R1: --- ---- (overlap) R2: ------- -------- (original) R3: (comp_incl) """ self.region_sets([['chr1',1,10],['chr1',26,35]], [['chr1',7,30]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 7) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 26) self.assertEqual(result[1].final, 30) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 26) self.assertEqual(result[1].final, 35) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Two separately overlapping regions A : ------- -------- B : ----- -------- R1: --- ---- (overlap) R2: ------- -------- (original) R3: (comp_incl) """ self.region_sets([['chr1',1,10],['chr1',26,35]], [['chr1',7,15],['chr1',30,40]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 7) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 30) self.assertEqual(result[1].final, 35) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 26) self.assertEqual(result[1].final, 35) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Many various overlapping (mixed) A : ------------------ -------- --------- B : ---- ------- ------ ---------- R1: -- ------- -- ---- --- (overlap) R2: ------------------ -------- --------- (original) R3: (comp_incl) """ self.region_sets([['chr1',3,30],['chr1',50,60],['chr1',70,85]], [['chr1',1,5],['chr1',10,19],['chr1',27,35],['chr1',55,75]]) result = self.setA.intersect(self.setB, mode=OverlapType.OVERLAP) self.assertEqual(len(result), 5) self.assertEqual(result[0].initial, 3) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) self.assertEqual(result[2].initial, 27) self.assertEqual(result[2].final, 30) self.assertEqual(result[3].initial, 55) self.assertEqual(result[3].final, 60) self.assertEqual(result[4].initial, 70) self.assertEqual(result[4].final, 75) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 3) self.assertEqual(result[0].final, 30) self.assertEqual(result[1].initial, 50) self.assertEqual(result[1].final, 60) self.assertEqual(result[2].initial, 70) self.assertEqual(result[2].final, 85) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Different chromosomes A : chr1 ------- B : chr2 ------- R : none """ self.region_sets([['chr1',1,10]], [['chr2',1,10]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Completely included overlapping A : --------------------------- B : ---- ------ ----------- R1: ---- ------ ------ (overlap) R2: --------------------------- (original) R3: (comp_incl) """ self.region_sets([['chr1',1,50]], [['chr1',1,5],['chr1',10,19],['chr1',45,60]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) self.assertEqual(result[2].initial, 45) self.assertEqual(result[2].final, 50) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 50) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ A : ---- ------ ----------- B : --------------------------- R1: ---- ------ ------ (overlap) R2: ---- ------ ----------- (original) R3: ---- ------ (comp_incl) """ self.region_sets([['chr1',1,5],['chr1',10,19],['chr1',45,60]], [['chr1',1,50]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) self.assertEqual(result[2].initial, 45) self.assertEqual(result[2].final, 50) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) self.assertEqual(result[2].initial, 45) self.assertEqual(result[2].final, 60) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) """ A : -------------- ------- ------ B : ----- ---------------- R1: ----- ------- (overlap) ---- R2: -------------- ------- (original) ------ R3: ------- (comp_incl) """ self.region_sets([['chr1',1,50],['chr1',20,40],['chr1',70,80]], [['chr1',25,45],['chr1',65,95]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 25) self.assertEqual(result[0].final, 45) self.assertEqual(result[1].initial, 70) self.assertEqual(result[1].final, 80) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 3) self.assertEqual(result[1].initial, 20) self.assertEqual(result[1].final, 40) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 50) self.assertEqual(result[2].initial, 70) self.assertEqual(result[2].final, 80) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 70) self.assertEqual(result[0].final, 80) def test_closest(self): """ Two empty sets A : none B : none R : none """ self.region_sets([], []) result = self.setA.closest(self.setB) self.assertEqual(len(result), 0) # """ # One empty set # A : ----- # B : none # R : none # """ # self.region_sets([['chr1',5,10]], # []) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 0) # """ # A : none # B : ----- # R : none # """ # self.region_sets([], # [['chr1',5,10]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 0) # """ # Overlapping within set # A : -----====----- # B : ---- # R : ---- # """ # self.region_sets([['chr1',1,10],['chr1',6,15]], # [['chr1',6,10]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 2) # """ # A : ---- # B : -----====----- # R : -----====----- # """ # self.region_sets([['chr1',6,10]], # [['chr1',1,10],['chr1',6,15]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 1) # """ # No overlapping # A : ------ --------- ------- # B : ---- ------ ------ # R : ------ # """ # self.region_sets([['chr1',1,5],['chr1',11,20],['chr1',33,38]], # [['chr1',7,9],['chr1',20,25],['chr1',26,31]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 3) # # self.assertEqual(result[0].initial, 20) # # self.assertEqual(result[0].final, 25) # """ # End-to-end attach # A : ------ ------ # B : ------ # R : ------ # """ # self.region_sets([['chr1',1,5],['chr1',11,20]], # [['chr1',5,11]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 2) # # self.assertEqual(result[0].initial, 5) # # self.assertEqual(result[0].final, 11) # """ # Perfect overlapping # A : ------ # B : ------ # R : ------ # """ # self.region_sets([['chr1',1,10]], # [['chr1',1,10]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 1) # self.assertEqual(result[0].initial, 1) # self.assertEqual(result[0].final, 10) # """ # One overlapping region # A : ------ # B : -------- # R : -------- # """ # self.region_sets([['chr1',1,10]], # [['chr1',7,20]]) # result = self.setA.closest(self.setB) # self.assertEqual(result[0].initial, 7) # self.assertEqual(result[0].final, 20) # """ # Two simple overlapping regions # A : ------- -------- # B : ------------- # R : ------------- # """ # self.region_sets([['chr1',1,10],['chr1',26,35]], # [['chr1',7,30]]) # result = self.setA.closest(self.setB) # self.assertEqual(result[0].initial, 7) # self.assertEqual(result[0].final, 30) # """ # Two separately overlapping regions # A : ------- -------- # B : ----- -------- # R : none # """ # self.region_sets([['chr1',1,10],['chr1',26,35]], # [['chr1',7,15],['chr1',30,40]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 2) # """ # Many various overlapping (mixed) # A : ------------------ -------- --------- # B : ---- ------- ------ ---------- # R : none # """ # self.region_sets([['chr1',3,30],['chr1',50,60],['chr1',70,85]], # [['chr1',1,5],['chr1',10,19],['chr1',27,35],['chr1',55,75]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 4) # """ # Different chromosomes # A : chr1 ------- # B : chr2 ------- # R : chr2 ------- # # """ # self.region_sets([['chr1',1,10]], # [['chr2',1,10]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 0) # """ # Completely included overlapping # A : --------------------------- # B : ---- ------ ----------- # R : ---- ------ ----------- # """ # self.region_sets([['chr1',1,50]], # [['chr1',1,5],['chr1',10,19],['chr1',45,60]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 3) # """ # A : ---- ------ ----------- # B : --------------------------- # R : none # """ # self.region_sets([['chr1',1,5],['chr1',10,19],['chr1',45,60]], # [['chr1',1,50]]) # result = self.setA.closest(self.setB) # self.assertEqual(result, False) # """ # A : ---- ------ --- # B : --- ----- # R : --- # """ # self.region_sets([['chr1',1,5],['chr1',27,45],['chr1',85,95]], # [['chr1',15,20],['chr1',55,65]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 1) # self.assertEqual(result[0].initial, 15) # self.assertEqual(result[0].final, 20) def test_remove_duplicates(self): """ A : ===== ----- R : ----- ----- """ self.region_sets([['chr1',1,10],['chr1',1,10],['chr1',15,25]], []) self.setA.remove_duplicates() result = self.setA self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) """ A : =====--- ----- R : =====--- ----- """ self.region_sets([['chr1',1,10],['chr1',1,15],['chr1',20,25]], []) self.setA.remove_duplicates() result = self.setA self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 1) self.assertEqual(result[1].final, 15) self.assertEqual(result[2].initial, 20) self.assertEqual(result[2].final, 25) """ A : ===== ----- ------ ==== R : ----- ----- ------ ---- """ self.region_sets([['chr1',1,10],['chr1',1,10],['chr1',15,25],['chr1',30,35],['chr1',40,45],['chr1',40,45]], []) self.setA.remove_duplicates() result = self.setA self.assertEqual(len(result), 4) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) self.assertEqual(result[2].initial, 30) self.assertEqual(result[2].final, 35) self.assertEqual(result[3].initial, 40) self.assertEqual(result[3].final, 45) def test_window(self): """ A : ------- B : ------[ 99 ] [ 199 ]--- window = 100 R : - only one base overlaps with extending A """ self.region_sets([['chr1',200,300]], [['chr1',1,101],['chr1',499,550]]) result = self.setA.window(self.setB,adding_length=100) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 100) self.assertEqual(result[0].final, 101) """ A : ------- B : ------[ 99 ] [ 199 ]--- window = 200 R : ------ - left-hand side is covered, and the right-hand side is only one base overlapped """ self.region_sets([['chr1',200,300]], [['chr1',1,101],['chr1',499,550]]) result = self.setA.window(self.setB,adding_length=200) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) # GenomicRegion.extend will choose 1 rather than 0 self.assertEqual(result[0].final, 101) self.assertEqual(result[1].initial, 499) self.assertEqual(result[1].final, 500) """ A : ---- ---- B : -------- ---- window = 1000 (default) R : ---- ---- """ self.region_sets([['chr1',3000,3500],['chr1',4000,4500]], [['chr1',1500,2500],['chr1',5000,5500]]) result = self.setA.window(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 2000) self.assertEqual(result[0].final, 2500) self.assertEqual(result[1].initial, 5000) self.assertEqual(result[1].final, 5500) """ A : ---- ---- B : -------- ---- window = 2000 R : -------- ---- ---- ---- window = 100 R : none """ self.region_sets([['chr1',3000,3500],['chr1',4000,4500]], [['chr1',1500,2500],['chr1',5000,5500]]) result = self.setA.window(self.setB,adding_length=2000) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1500) self.assertEqual(result[0].final, 2500) self.assertEqual(result[1].initial, 5000) self.assertEqual(result[1].final, 5500) result = self.setA.window(self.setB,adding_length=100) self.assertEqual(len(result), 0) def test_subtract(self): """ A : none B : ------ R : none """ self.region_sets([], [['chr1',6,15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 0) """ A : ------ B : none R : ------ """ self.region_sets([['chr1',6,15]], []) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 6) self.assertEqual(result[0].final, 15) """ A : ------ B : ------ R : --- """ self.region_sets([['chr1',1,10]], [['chr1',6,15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 6) """ A : ------ B : ------ R : --- """ self.region_sets([['chr1',6,15]], [['chr1',1,10]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 10) self.assertEqual(result[0].final, 15) """ A : --- B : --------- R : none """ self.region_sets([['chr1',6,10]], [['chr1',1,15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 0) """ A : --------- B : --- R : --- --- """ self.region_sets([['chr1',1,15]], [['chr1',6,10]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 6) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 15) """ A : ------ B : ------ R : none """ self.region_sets([['chr1',6,15]], [['chr1',6,15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 0) """ A : ---------- ------ B : ---------- ---- R : ------- ------ """ self.region_sets([['chr1',5,30],['chr1',70,85]], [['chr1',20,50],['chr1',100,110]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 5) self.assertEqual(result[0].final, 20) self.assertEqual(result[1].initial, 70) self.assertEqual(result[1].final, 85) """ A : ------ ----- B : ------ R : ---- ----- """ self.region_sets([['chr1',20,30],['chr1',35,55]], [['chr1',10,23],['chr1',100,110]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 23) self.assertEqual(result[0].final, 30) self.assertEqual(result[1].initial, 35) self.assertEqual(result[1].final, 55) """ A : ch1 --------------------- ch2 ------------------------- B : ch1 ------ ch2 ------ R : ch1 -------- ------- ch2 ------------------- """ self.region_sets([['chr1',0,30000],['chr2',0,35000]], [['chr1',20000,23000],['chr2',31000,35000]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 0) self.assertEqual(result[0].final, 20000) self.assertEqual(result[1].initial, 23000) self.assertEqual(result[1].final, 30000) self.assertEqual(result[2].initial, 0) self.assertEqual(result[2].final, 31000) """ A : ----------------------------------------------------------- B : --- --------- ---- ---- R : - ---- --------- ----------- -------------- """ self.region_sets([['chr1',5,1000]], [['chr1',10,15],['chr1',30,70],['chr1',120,140],['chr1',200,240]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 5) """ A : ----------------------- ------ ----- ----- ----------- B : --- --------- ---- ---- R : - ---- ------ ---- --- --- ---- --- """ self.region_sets([['chr1',5,100],['chr1',20,40],['chr1',60,80],['chr1',95,150],['chr1',180,220]], [['chr1',10,15],['chr1',30,70],['chr1',120,140],['chr1',200,240]]) result = self.setA.subtract(self.setB) #print(result.sequences) self.assertEqual(len(result), 8) self.assertEqual(result[0].initial, 5) """ A : ----------------------------------------------------------- B : --- --------- ---- ---- R : - ---- --------- ----------- -------------- """ self.region_sets([['chr1',5,1000],['chr2',5,1000],['chr4',5,1000]], [['chr1',10,15],['chr1',30,70],['chr1',120,140],['chr1',200,240], ['chr2',10,15],['chr2',30,70],['chr2',120,140],['chr2',200,240], ['chr4',10,15],['chr4',30,70],['chr4',120,140],['chr4',200,240]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 15) def test_merge(self): """ A : none R : none """ self.region_sets([], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 0) """ A : ----- ----- R : ----- ----- """ self.region_sets([['chr1',1,10],['chr1',15,25]], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) """ A1: ------------ ---- A2: ----- R : ------------ ---- """ self.region_sets([['chr1',1,30],['chr1',11,20],['chr1',40,50]], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 30) self.assertEqual(result[1].initial, 40) self.assertEqual(result[1].final, 50) """ A1: -------- ---- A2: --------- R : ------------ ---- """ self.region_sets([['chr1',1,30],['chr1',20,40],['chr1',50,60]], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 40) self.assertEqual(result[1].initial, 50) self.assertEqual(result[1].final, 60) """ A : ======= R : ------- """ self.region_sets([['chr1',1,30],['chr1',1,30]], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 30) def test_cluster(self): """ Empty sets A : none R : none """ self.region_sets([], []) result = self.setA.cluster(10) self.assertEqual(len(result), 0) """ A : ------- R : ------- """ self.region_sets([['chr1',1,10]], []) result = self.setA.cluster(10) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) """ A : ----- ------ R : ----------- """ self.region_sets([['chr1',1,10],['chr1',10,20]], []) result = self.setA.cluster(10) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 20) """ A : ----- ----- R1: ----- ----- R2: ------------ """ self.region_sets([['chr1',1,10],['chr1',15,25]], []) result = self.setA.cluster(1) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) result = self.setA.cluster(5) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) result = self.setA.cluster(6) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 25) """ A : ---- ---- ---- ---- ---- R1: --------- ---- ---- ---- R2: --------------- ---- ---- R3: ---------------------- ---- R4: ------------------------------ R5: ------------------------------ """ self.region_sets([['chr1',1,10],['chr1',15,25],['chr1',35,45], ['chr1',60,70],['chr1',90,100]], []) result = self.setA.cluster(6) self.assertEqual(len(result), 4) result = self.setA.cluster(11) self.assertEqual(len(result), 3) result = self.setA.cluster(16) self.assertEqual(len(result), 2) result = self.setA.cluster(21) self.assertEqual(len(result), 1) result = self.setA.cluster(26) self.assertEqual(len(result), 1) def test_flank(self): """ A : ----- R1: --- --- """ self.region_sets([['chr1',60,75]], []) result = self.setA.flank(10) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 50) self.assertEqual(result[0].final, 60) self.assertEqual(result[1].initial, 75) self.assertEqual(result[1].final, 85) """ A : ----- ---- R1: ----- ===== ---- """ self.region_sets([['chr1',60,75],['chr1',90,100]], []) result = self.setA.flank(15) self.assertEqual(len(result), 4) self.assertEqual(result[0].initial, 45) self.assertEqual(result[0].final, 60) self.assertEqual(result[1].initial, 75) self.assertEqual(result[1].final, 90) self.assertEqual(result[2].initial, 75) self.assertEqual(result[2].final, 90) self.assertEqual(result[3].initial, 100) self.assertEqual(result[3].final, 115) def test_jaccard(self): """ self --8-- ---10--- -4- y ---10--- ---10--- intersect -5- -4- similarity: ( 5 + 4 )/[(8 + 10 + 4) + (10 +10) - (5 + 4 )] = 9/33 """ self.region_sets([['chr1',50,58],['chr1',70,80],['chr1',90,94]], [['chr1',45,55],['chr1',76,86]]) result = self.setA.jaccard(self.setB) self.assertEqual(result, 9/33) def test_get_genome_data(self): """hg19""" result = GenomicRegionSet("hg19") result.get_genome_data(organism="hg19") self.assertEqual(len(result), 23) """hg19, with Mitochondria chromosome""" result = GenomicRegionSet("hg19") result.get_genome_data(organism="hg19",chrom_M=True) self.assertEqual(len(result), 24) def test_random_regions(self): self.region_sets([['chr1',0,10000],['chr2',0,20000],['chrX',0,30000]], []) result = self.setA.random_regions(organism="mm9", total_size=100, overlap_result=False, overlap_input=False) result.sort() #print("-"*80) #print("The result random regions are: ") #for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) #print("Overlaps within result: ",result.within_overlap()) self.region_sets([['chr1',0,10000],['chr2',0,20000],['chrX',0,30000]], []) result = self.setA.random_regions(organism="mm9", total_size=100, overlap_result=True, overlap_input=False) result.sort() #print("-"*80) #print("The result random regions are: ") #for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) #print("Overlaps within result: ",result.within_overlap()) self.region_sets([['chr1',0,10000],['chr2',0,20000],['chrX',0,30000]], []) result = self.setA.random_regions(organism="mm9", total_size=100, overlap_result=False, overlap_input=True) result.sort() #print("-"*80) #print("The result random regions are: ") #for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) #print("Overlaps within result: ",result.within_overlap()) self.region_sets([['chr1',0,10000],['chr2',0,20000],['chrX',0,30000]], []) result = self.setA.random_regions(organism="mm9", total_size=100, overlap_result=True, overlap_input=True) result.sort() #print("-"*80) #print("The result random regions are: ") #for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) #print("Overlaps within result: ",result.within_overlap()) self.region_sets([['chr1',0,1000],['chr2',0,2000],['chrX',0,3000]], []) result = self.setA.random_regions(organism="mm9", multiply_factor=100, overlap_result=False, overlap_input=False) result.sort() #print("-"*80) #print("The result random regions are: ") #for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) #print("Overlaps within result: ",result.within_overlap()) self.region_sets([['chr1',0,1000],['chr2',0,2000],['chrX',0,3000]], []) result = self.setA.random_regions(organism="mm9", multiply_factor=100, overlap_result=False, overlap_input=False, chrom_M=True) result.sort()
class TestGenomicRegionSet(unittest.TestCase): def region_sets(self, listA, listB): """ Setting two GenomicRegionSets as self.setA and self.setB for each case test. """ self.setA = GenomicRegionSet('for Unit Test') for i in range(len(listA)): self.setA.add( GenomicRegion(chrom=listA[i][0], initial=listA[i][1], final=listA[i][2])) self.setB = GenomicRegionSet('for Unit Test') for i in range(len(listB)): self.setB.add( GenomicRegion(chrom=listB[i][0], initial=listB[i][1], final=listB[i][2])) def test_extend(self): """ Two empty sets A : none R : none """ self.region_sets([], []) self.setA.extend(100, 100) self.assertEqual(len(self.setA.sequences), 0) """ One region A : ----- R : --------- """ self.region_sets([['chr1', 5, 10]], []) result = self.setA result.extend(4, 4) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 14) """ Many region A : ----- ------ ----- ----- R : --------=--------- ------------------ """ self.region_sets([['chr1', 5, 10], ['chr1', 15, 20], ['chr1', 40, 50], ['chr1', 65, 75]], []) result = self.setA result.extend(5, 5) self.assertEqual(len(result), 4) self.assertEqual(result[0].initial, 0) self.assertEqual(result[0].final, 15) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 25) self.assertEqual(result[2].initial, 35) self.assertEqual(result[2].final, 55) self.assertEqual(result[3].initial, 60) self.assertEqual(result[3].final, 80) """ Many region in different chromosome A : ----- ------ ----- ----- R : none """ self.region_sets([['chr1', 5, 10], ['chr2', 15, 20], ['chr3', 40, 50], ['chr4', 65, 75]], []) result = self.setA result.extend(5, 5) self.assertEqual(len(result), 4) self.assertEqual(result[0].initial, 0) self.assertEqual(result[0].final, 15) self.assertEqual(result[0].chrom, 'chr1') self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 25) self.assertEqual(result[1].chrom, 'chr2') self.assertEqual(result[2].initial, 35) self.assertEqual(result[2].final, 55) self.assertEqual(result[2].chrom, 'chr3') self.assertEqual(result[3].initial, 60) self.assertEqual(result[3].final, 80) self.assertEqual(result[3].chrom, 'chr4') """ One region A : ----- R : --------- """ self.region_sets([['chr1', 100, 200]], []) result = self.setA result.extend(10, 10, percentage=True) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 90) self.assertEqual(result[0].final, 210) def test_sort(self): self.region_sets([['chr1', 15, 20], ['chr1', 40, 50], ['chr1', 65, 75], ['chr1', 5, 10]], []) self.setA.sort() def test_intersect(self): """ Two empty sets A : none B : none R : none """ self.region_sets([], []) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ One empty set A : ----- B : none R : none """ self.region_sets([['chr1', 5, 10]], []) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ A : none B : ----- R : none """ self.region_sets([], [['chr1', 5, 10]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ No overlapping A : ------ --------- ------- B : ---- ------ ------ R : none """ self.region_sets([['chr1', 1, 5], ['chr1', 11, 20], ['chr1', 33, 38]], [['chr1', 7, 9], ['chr1', 20, 25], ['chr1', 26, 31]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ End-to-end attach A : ------ ------ B : ------ R : none """ self.region_sets([['chr1', 1, 5], ['chr1', 11, 20]], [['chr1', 5, 11]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ No length attach A : . . B : . . R : none """ self.region_sets([['chr1', 2, 2], ['chr1', 20, 20]], [['chr1', 5, 5], ['chr1', 20, 20]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Perfect overlapping A : ------ B : ------ R : ------ """ self.region_sets( [['chr1', 1, 10], ['chr1', 500, 550], ['chr1', 600, 650], ['chr1', 700, 750], ['chr1', 725, 800]], [['chr1', 1, 10], ['chr1', 500, 550], ['chr1', 600, 650], ['chr1', 700, 750], ['chr1', 725, 800]]) result = self.setA.intersect(self.setB, mode=OverlapType.OVERLAP, rm_duplicates=True) self.assertEqual(len(result), 4) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 5) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 5) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) """ One overlapping region A : ------ B : -------- R1: -- (overlap) R2: ------ (original) R3: (comp_incl) """ self.region_sets([['chr1', 1, 10]], [['chr1', 7, 20]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 7) self.assertEqual(result[0].final, 10) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Two simple overlapping regions A : ------- -------- B : ------------- R1: --- ---- (overlap) R2: ------- -------- (original) R3: (comp_incl) """ self.region_sets([['chr1', 1, 10], ['chr1', 26, 35]], [['chr1', 7, 30]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 7) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 26) self.assertEqual(result[1].final, 30) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 26) self.assertEqual(result[1].final, 35) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Two separately overlapping regions A : ------- -------- B : ----- -------- R1: --- ---- (overlap) R2: ------- -------- (original) R3: (comp_incl) """ self.region_sets([['chr1', 1, 10], ['chr1', 26, 35]], [['chr1', 7, 15], ['chr1', 30, 40]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 7) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 30) self.assertEqual(result[1].final, 35) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 26) self.assertEqual(result[1].final, 35) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Many various overlapping (mixed) A : ------------------ -------- --------- B : ---- ------- ------ ---------- R1: -- ------- -- ---- --- (overlap) R2: ------------------ -------- --------- (original) R3: (comp_incl) """ self.region_sets([['chr1', 3, 30], ['chr1', 50, 60], ['chr1', 70, 85]], [['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 27, 35], ['chr1', 55, 75]]) result = self.setA.intersect(self.setB, mode=OverlapType.OVERLAP) self.assertEqual(len(result), 5) self.assertEqual(result[0].initial, 3) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) self.assertEqual(result[2].initial, 27) self.assertEqual(result[2].final, 30) self.assertEqual(result[3].initial, 55) self.assertEqual(result[3].final, 60) self.assertEqual(result[4].initial, 70) self.assertEqual(result[4].final, 75) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 3) self.assertEqual(result[0].final, 30) self.assertEqual(result[1].initial, 50) self.assertEqual(result[1].final, 60) self.assertEqual(result[2].initial, 70) self.assertEqual(result[2].final, 85) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Different chromosomes A : chr1 ------- B : chr2 ------- R : none """ self.region_sets([['chr1', 1, 10]], [['chr2', 1, 10]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 0) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ Completely included overlapping A : --------------------------- B : ---- ------ ----------- R1: ---- ------ ------ (overlap) R2: --------------------------- (original) R3: (comp_incl) """ self.region_sets([['chr1', 1, 50]], [['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 45, 60]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) self.assertEqual(result[2].initial, 45) self.assertEqual(result[2].final, 50) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 50) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 0) """ A : ---- ------ ----------- B : --------------------------- R1: ---- ------ ------ (overlap) R2: ---- ------ ----------- (original) R3: ---- ------ (comp_incl) """ self.region_sets([['chr1', 1, 5], ['chr1', 10, 19], ['chr1', 45, 60]], [['chr1', 1, 50]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) self.assertEqual(result[2].initial, 45) self.assertEqual(result[2].final, 50) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) self.assertEqual(result[2].initial, 45) self.assertEqual(result[2].final, 60) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 5) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 19) """ A : -------------- ------- ------ B : ----- ---------------- R1: ----- ------- (overlap) ---- R2: -------------- ------- (original) ------ R3: ------- (comp_incl) """ self.region_sets([['chr1', 1, 50], ['chr1', 20, 40], ['chr1', 70, 80]], [['chr1', 25, 45], ['chr1', 65, 95]]) result = self.setA.intersect(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 25) self.assertEqual(result[0].final, 45) self.assertEqual(result[1].initial, 70) self.assertEqual(result[1].final, 80) result = self.setA.intersect(self.setB, mode=OverlapType.ORIGINAL) self.assertEqual(len(result), 3) self.assertEqual(result[1].initial, 20) self.assertEqual(result[1].final, 40) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 50) self.assertEqual(result[2].initial, 70) self.assertEqual(result[2].final, 80) result = self.setA.intersect(self.setB, mode=OverlapType.COMP_INCL) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 70) self.assertEqual(result[0].final, 80) def test_closest(self): """ Two empty sets A : none B : none R : none """ self.region_sets([], []) result = self.setA.closest(self.setB) self.assertEqual(len(result), 0) # """ # One empty set # A : ----- # B : none # R : none # """ # self.region_sets([['chr1',5,10]], # []) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 0) # """ # A : none # B : ----- # R : none # """ # self.region_sets([], # [['chr1',5,10]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 0) # """ # Overlapping within set # A : -----====----- # B : ---- # R : ---- # """ # self.region_sets([['chr1',1,10],['chr1',6,15]], # [['chr1',6,10]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 2) # """ # A : ---- # B : -----====----- # R : -----====----- # """ # self.region_sets([['chr1',6,10]], # [['chr1',1,10],['chr1',6,15]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 1) # """ # No overlapping # A : ------ --------- ------- # B : ---- ------ ------ # R : ------ # """ # self.region_sets([['chr1',1,5],['chr1',11,20],['chr1',33,38]], # [['chr1',7,9],['chr1',20,25],['chr1',26,31]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 3) # # self.assertEqual(result[0].initial, 20) # # self.assertEqual(result[0].final, 25) # """ # End-to-end attach # A : ------ ------ # B : ------ # R : ------ # """ # self.region_sets([['chr1',1,5],['chr1',11,20]], # [['chr1',5,11]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 2) # # self.assertEqual(result[0].initial, 5) # # self.assertEqual(result[0].final, 11) # """ # Perfect overlapping # A : ------ # B : ------ # R : ------ # """ # self.region_sets([['chr1',1,10]], # [['chr1',1,10]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 1) # self.assertEqual(result[0].initial, 1) # self.assertEqual(result[0].final, 10) # """ # One overlapping region # A : ------ # B : -------- # R : -------- # """ # self.region_sets([['chr1',1,10]], # [['chr1',7,20]]) # result = self.setA.closest(self.setB) # self.assertEqual(result[0].initial, 7) # self.assertEqual(result[0].final, 20) # """ # Two simple overlapping regions # A : ------- -------- # B : ------------- # R : ------------- # """ # self.region_sets([['chr1',1,10],['chr1',26,35]], # [['chr1',7,30]]) # result = self.setA.closest(self.setB) # self.assertEqual(result[0].initial, 7) # self.assertEqual(result[0].final, 30) # """ # Two separately overlapping regions # A : ------- -------- # B : ----- -------- # R : none # """ # self.region_sets([['chr1',1,10],['chr1',26,35]], # [['chr1',7,15],['chr1',30,40]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 2) # """ # Many various overlapping (mixed) # A : ------------------ -------- --------- # B : ---- ------- ------ ---------- # R : none # """ # self.region_sets([['chr1',3,30],['chr1',50,60],['chr1',70,85]], # [['chr1',1,5],['chr1',10,19],['chr1',27,35],['chr1',55,75]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 4) # """ # Different chromosomes # A : chr1 ------- # B : chr2 ------- # R : chr2 ------- # # """ # self.region_sets([['chr1',1,10]], # [['chr2',1,10]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 0) # """ # Completely included overlapping # A : --------------------------- # B : ---- ------ ----------- # R : ---- ------ ----------- # """ # self.region_sets([['chr1',1,50]], # [['chr1',1,5],['chr1',10,19],['chr1',45,60]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 3) # """ # A : ---- ------ ----------- # B : --------------------------- # R : none # """ # self.region_sets([['chr1',1,5],['chr1',10,19],['chr1',45,60]], # [['chr1',1,50]]) # result = self.setA.closest(self.setB) # self.assertEqual(result, False) # """ # A : ---- ------ --- # B : --- ----- # R : --- # """ # self.region_sets([['chr1',1,5],['chr1',27,45],['chr1',85,95]], # [['chr1',15,20],['chr1',55,65]]) # result = self.setA.closest(self.setB) # self.assertEqual(len(result), 1) # self.assertEqual(result[0].initial, 15) # self.assertEqual(result[0].final, 20) def test_remove_duplicates(self): """ A : ===== ----- R : ----- ----- """ self.region_sets([['chr1', 1, 10], ['chr1', 1, 10], ['chr1', 15, 25]], []) self.setA.remove_duplicates() result = self.setA self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) """ A : =====--- ----- R : =====--- ----- """ self.region_sets([['chr1', 1, 10], ['chr1', 1, 15], ['chr1', 20, 25]], []) self.setA.remove_duplicates() result = self.setA self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 1) self.assertEqual(result[1].final, 15) self.assertEqual(result[2].initial, 20) self.assertEqual(result[2].final, 25) """ A : ===== ----- ------ ==== R : ----- ----- ------ ---- """ self.region_sets( [['chr1', 1, 10], ['chr1', 1, 10], ['chr1', 15, 25], ['chr1', 30, 35], ['chr1', 40, 45], ['chr1', 40, 45]], []) self.setA.remove_duplicates() result = self.setA self.assertEqual(len(result), 4) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) self.assertEqual(result[2].initial, 30) self.assertEqual(result[2].final, 35) self.assertEqual(result[3].initial, 40) self.assertEqual(result[3].final, 45) def test_window(self): """ A : ------- B : ------[ 99 ] [ 199 ]--- window = 100 R : - only one base overlaps with extending A """ self.region_sets([['chr1', 200, 300]], [['chr1', 1, 101], ['chr1', 499, 550]]) result = self.setA.window(self.setB, adding_length=100) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 100) self.assertEqual(result[0].final, 101) """ A : ------- B : ------[ 99 ] [ 199 ]--- window = 200 R : ------ - left-hand side is covered, and the right-hand side is only one base overlapped """ self.region_sets([['chr1', 200, 300]], [['chr1', 1, 101], ['chr1', 499, 550]]) result = self.setA.window(self.setB, adding_length=200) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) # GenomicRegion.extend will choose 1 rather than 0 self.assertEqual(result[0].final, 101) self.assertEqual(result[1].initial, 499) self.assertEqual(result[1].final, 500) """ A : ---- ---- B : -------- ---- window = 1000 (default) R : ---- ---- """ self.region_sets([['chr1', 3000, 3500], ['chr1', 4000, 4500]], [['chr1', 1500, 2500], ['chr1', 5000, 5500]]) result = self.setA.window(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 2000) self.assertEqual(result[0].final, 2500) self.assertEqual(result[1].initial, 5000) self.assertEqual(result[1].final, 5500) """ A : ---- ---- B : -------- ---- window = 2000 R : -------- ---- ---- ---- window = 100 R : none """ self.region_sets([['chr1', 3000, 3500], ['chr1', 4000, 4500]], [['chr1', 1500, 2500], ['chr1', 5000, 5500]]) result = self.setA.window(self.setB, adding_length=2000) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1500) self.assertEqual(result[0].final, 2500) self.assertEqual(result[1].initial, 5000) self.assertEqual(result[1].final, 5500) result = self.setA.window(self.setB, adding_length=100) self.assertEqual(len(result), 0) def test_subtract(self): """ A : none B : ------ R : none """ self.region_sets([], [['chr1', 6, 15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 0) """ A : ------ B : none R : ------ """ self.region_sets([['chr1', 6, 15]], []) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 6) self.assertEqual(result[0].final, 15) """ A : ------ B : ------ R : --- """ self.region_sets([['chr1', 1, 10]], [['chr1', 6, 15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 6) """ A : ------ B : ------ R : --- """ self.region_sets([['chr1', 6, 15]], [['chr1', 1, 10]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 10) self.assertEqual(result[0].final, 15) """ A : --- B : --------- R : none """ self.region_sets([['chr1', 6, 10]], [['chr1', 1, 15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 0) """ A : --------- B : --- R : --- --- """ self.region_sets([['chr1', 1, 15]], [['chr1', 6, 10]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 6) self.assertEqual(result[1].initial, 10) self.assertEqual(result[1].final, 15) """ A : ------ B : ------ R : none """ self.region_sets([['chr1', 6, 15]], [['chr1', 6, 15]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 0) """ A : ---------- ------ B : ---------- ---- R : ------- ------ """ self.region_sets([['chr1', 5, 30], ['chr1', 70, 85]], [['chr1', 20, 50], ['chr1', 100, 110]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 5) self.assertEqual(result[0].final, 20) self.assertEqual(result[1].initial, 70) self.assertEqual(result[1].final, 85) """ A : ------ ----- B : ------ R : ---- ----- """ self.region_sets([['chr1', 20, 30], ['chr1', 35, 55]], [['chr1', 10, 23], ['chr1', 100, 110]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 23) self.assertEqual(result[0].final, 30) self.assertEqual(result[1].initial, 35) self.assertEqual(result[1].final, 55) """ A : ch1 --------------------- ch2 ------------------------- B : ch1 ------ ch2 ------ R : ch1 -------- ------- ch2 ------------------- """ self.region_sets([['chr1', 0, 30000], ['chr2', 0, 35000]], [['chr1', 20000, 23000], ['chr2', 31000, 35000]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 3) self.assertEqual(result[0].initial, 0) self.assertEqual(result[0].final, 20000) self.assertEqual(result[1].initial, 23000) self.assertEqual(result[1].final, 30000) self.assertEqual(result[2].initial, 0) self.assertEqual(result[2].final, 31000) """ A : ----------------------------------------------------------- B : --- --------- ---- ---- R : - ---- --------- ----------- -------------- """ self.region_sets([['chr1', 5, 1000]], [['chr1', 10, 15], ['chr1', 30, 70], ['chr1', 120, 140], ['chr1', 200, 240]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 5) """ A : ----------------------- ------ ----- ----- ----------- B : --- --------- ---- ---- R : - ---- ------ ---- --- --- ---- --- """ self.region_sets([['chr1', 5, 100], ['chr1', 20, 40], ['chr1', 60, 80], ['chr1', 95, 150], ['chr1', 180, 220]], [['chr1', 10, 15], ['chr1', 30, 70], ['chr1', 120, 140], ['chr1', 200, 240]]) result = self.setA.subtract(self.setB) # print(result.sequences) self.assertEqual(len(result), 8) self.assertEqual(result[0].initial, 5) """ A : ----------------------------------------------------------- B : --- --------- ---- ---- R : - ---- --------- ----------- -------------- """ self.region_sets( [['chr1', 5, 1000], ['chr2', 5, 1000], ['chr4', 5, 1000]], [['chr1', 10, 15], ['chr1', 30, 70], ['chr1', 120, 140], ['chr1', 200, 240], ['chr2', 10, 15], ['chr2', 30, 70], ['chr2', 120, 140], ['chr2', 200, 240], ['chr4', 10, 15], ['chr4', 30, 70], ['chr4', 120, 140], ['chr4', 200, 240]]) result = self.setA.subtract(self.setB) self.assertEqual(len(result), 15) def test_merge(self): """ A : none R : none """ self.region_sets([], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 0) """ A : ----- ----- R : ----- ----- """ self.region_sets([['chr1', 1, 10], ['chr1', 15, 25]], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) """ A1: ------------ ---- A2: ----- R : ------------ ---- """ self.region_sets([['chr1', 1, 30], ['chr1', 11, 20], ['chr1', 40, 50]], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 30) self.assertEqual(result[1].initial, 40) self.assertEqual(result[1].final, 50) """ A1: -------- ---- A2: --------- R : ------------ ---- """ self.region_sets([['chr1', 1, 30], ['chr1', 20, 40], ['chr1', 50, 60]], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 40) self.assertEqual(result[1].initial, 50) self.assertEqual(result[1].final, 60) """ A : ======= R : ------- """ self.region_sets([['chr1', 1, 30], ['chr1', 1, 30]], []) self.setA.merge() result = self.setA self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 30) def test_cluster(self): """ Empty sets A : none R : none """ self.region_sets([], []) result = self.setA.cluster(10) self.assertEqual(len(result), 0) """ A : ------- R : ------- """ self.region_sets([['chr1', 1, 10]], []) result = self.setA.cluster(10) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) """ A : ----- ------ R : ----------- """ self.region_sets([['chr1', 1, 10], ['chr1', 10, 20]], []) result = self.setA.cluster(10) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 20) """ A : ----- ----- R1: ----- ----- R2: ------------ """ self.region_sets([['chr1', 1, 10], ['chr1', 15, 25]], []) result = self.setA.cluster(1) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) result = self.setA.cluster(5) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 10) self.assertEqual(result[1].initial, 15) self.assertEqual(result[1].final, 25) result = self.setA.cluster(6) self.assertEqual(len(result), 1) self.assertEqual(result[0].initial, 1) self.assertEqual(result[0].final, 25) """ A : ---- ---- ---- ---- ---- R1: --------- ---- ---- ---- R2: --------------- ---- ---- R3: ---------------------- ---- R4: ------------------------------ R5: ------------------------------ """ self.region_sets([['chr1', 1, 10], ['chr1', 15, 25], ['chr1', 35, 45], ['chr1', 60, 70], ['chr1', 90, 100]], []) result = self.setA.cluster(6) self.assertEqual(len(result), 4) result = self.setA.cluster(11) self.assertEqual(len(result), 3) result = self.setA.cluster(16) self.assertEqual(len(result), 2) result = self.setA.cluster(21) self.assertEqual(len(result), 1) result = self.setA.cluster(26) self.assertEqual(len(result), 1) def test_flank(self): """ A : ----- R1: --- --- """ self.region_sets([['chr1', 60, 75]], []) result = self.setA.flank(10) self.assertEqual(len(result), 2) self.assertEqual(result[0].initial, 50) self.assertEqual(result[0].final, 60) self.assertEqual(result[1].initial, 75) self.assertEqual(result[1].final, 85) """ A : ----- ---- R1: ----- ===== ---- """ self.region_sets([['chr1', 60, 75], ['chr1', 90, 100]], []) result = self.setA.flank(15) self.assertEqual(len(result), 4) self.assertEqual(result[0].initial, 45) self.assertEqual(result[0].final, 60) self.assertEqual(result[1].initial, 75) self.assertEqual(result[1].final, 90) self.assertEqual(result[2].initial, 75) self.assertEqual(result[2].final, 90) self.assertEqual(result[3].initial, 100) self.assertEqual(result[3].final, 115) def test_jaccard(self): """ self --8-- ---10--- -4- y ---10--- ---10--- intersect -5- -4- similarity: ( 5 + 4 )/[(8 + 10 + 4) + (10 +10) - (5 + 4 )] = 9/33 """ self.region_sets( [['chr1', 50, 58], ['chr1', 70, 80], ['chr1', 90, 94]], [['chr1', 45, 55], ['chr1', 76, 86]]) result = self.setA.jaccard(self.setB) self.assertEqual(result, 9 / 33) def test_get_genome_data(self): """hg19""" result = GenomicRegionSet("hg19") result.get_genome_data(organism="hg19") self.assertEqual(len(result), 23) """hg19, with Mitochondria chromosome""" result = GenomicRegionSet("hg19") result.get_genome_data(organism="hg19", chrom_M=True) self.assertEqual(len(result), 24) def test_random_regions(self): self.region_sets( [['chr1', 0, 10000], ['chr2', 0, 20000], ['chrX', 0, 30000]], []) result = self.setA.random_regions(organism="mm9", total_size=100, overlap_result=False, overlap_input=False) result.sort() # print("-"*80) # print("The result random regions are: ") # for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) # print("Overlaps within result: ",result.within_overlap()) self.region_sets( [['chr1', 0, 10000], ['chr2', 0, 20000], ['chrX', 0, 30000]], []) result = self.setA.random_regions(organism="mm9", total_size=100, overlap_result=True, overlap_input=False) result.sort() # print("-"*80) # print("The result random regions are: ") # for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) # print("Overlaps within result: ",result.within_overlap()) self.region_sets( [['chr1', 0, 10000], ['chr2', 0, 20000], ['chrX', 0, 30000]], []) result = self.setA.random_regions(organism="mm9", total_size=100, overlap_result=False, overlap_input=True) result.sort() # print("-"*80) # print("The result random regions are: ") # for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) # print("Overlaps within result: ",result.within_overlap()) self.region_sets( [['chr1', 0, 10000], ['chr2', 0, 20000], ['chrX', 0, 30000]], []) result = self.setA.random_regions(organism="mm9", total_size=100, overlap_result=True, overlap_input=True) result.sort() # print("-"*80) # print("The result random regions are: ") # for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) # print("Overlaps within result: ",result.within_overlap()) self.region_sets( [['chr1', 0, 1000], ['chr2', 0, 2000], ['chrX', 0, 3000]], []) result = self.setA.random_regions(organism="mm9", multiply_factor=100, overlap_result=False, overlap_input=False) result.sort() # print("-"*80) # print("The result random regions are: ") # for s in result.sequences: # print("\t%s\t%10d\t%10d%10d" % (s.chrom,s.initial,s.final,s.__len__())) # print("Overlaps within result: ",result.within_overlap()) self.region_sets( [['chr1', 0, 1000], ['chr2', 0, 2000], ['chrX', 0, 3000]], []) result = self.setA.random_regions(organism="mm9", multiply_factor=100, overlap_result=False, overlap_input=False, chrom_M=True) result.sort()
def load_exon_sequence(bed, directory, genome_path): """Load the exon sequence from the the transcripts. Input BED format should contain: blockCount - The number of blocks (exons) in the BED line. blockSizes - A comma-separated list of the block sizes. blockStarts - A comma-separated list of block starts. see details: http://genome.ucsc.edu/FAQ/FAQformat#format1 Output: Each FASTA file represants a transcript and contains all the exons within the file. """ regionset = GenomicRegionSet("bed") regionset.read(bed) regionset.sort() genome = pysam.Fastafile(genome_path) try: if len(regionset.sequences[0].data.split("\t")) == 7: blockinfor = True no_exon = False except: blockinfor = False regionset.sequences.sort(key=lambda g: g.name) no_exon = True if blockinfor: for gr in regionset: if not gr.name: print("Error: For fetching exon sequences, please define the transcript name.") sys.exit() else: if not os.path.exists(directory): os.makedirs(directory) f = open(os.path.join(directory, gr.name+".fa"), "w") data = gr.data.split("\t") #print(len(data)) if len(data) == 7: #print(data) n = int(data[4]) blocks = [ int(b) for b in filter(None, data[5].split(",")) ] starts = [ int(s) for s in filter(None, data[6].split(",")) ] printstr = [] for i in range(n): start = gr.initial + starts[i] end = start + blocks[i] if no_exon and i == 0: ex = "" elif gr.orientation == "-": ex = "exon:"+str(n-i) else: ex = "exon:"+str(i+1) if gr.orientation == "-": seq = Seq(genome.fetch(gr.chrom, start-1, end-1), IUPAC.unambiguous_dna) seq = seq.reverse_complement() p = [ ">"+ " ".join([ gr.name, ex, "_".join(["REGION",gr.chrom, str(start),str(end), gr.orientation]) ]), seq ] printstr.append(p) else: p = [ ">"+ " ".join([gr.name, ex, "_".join(["REGION",gr.chrom,str(start),str(end), gr.orientation]) ]), genome.fetch(gr.chrom, start-1, end-1) ] printstr.append(p) if gr.orientation == "-": printstr = printstr[::-1] for i in range(n): print(printstr[i][0], file=f) print(printstr[i][1], file=f) else: print("Warning: The given regions have no block information, please try write_bed_blocks") f.close() else: pre_id = "" for gr in regionset: if not gr.name: gr.name = gr.toString() if pre_id == "": pre_id = gr.name z = GenomicRegionSet(gr.name) z.add(gr) elif gr.name == pre_id: z.add(gr) else: f = open(os.path.join(directory, pre_id+".fa"), "w") for i, g in enumerate(z): try: regiontag = "_".join(["REGION", g.chrom, str(g.initial),str(g.final), gr.orientation]) except: regiontag = "_".join(["REGION", g.chrom, str(g.initial),str(g.final)] ) print( ">"+ " ".join([g.name, regiontag ]), file=f) print(genome.fetch(g.chrom, g.initial, g.final), file=f) f.close() pre_id = gr.name z = GenomicRegionSet(gr.name) z.add(gr) # Last TX f = open(os.path.join(directory, pre_id+".fa"), "w") for i, g in enumerate(z): try: regiontag = "_".join(["REGION", g.chrom, str(g.initial),str(g.final), gr.orientation]) except: regiontag = "_".join(["REGION", g.chrom, str(g.initial),str(g.final)] ) print( ">"+ " ".join([g.name, regiontag ]), file=f) print(genome.fetch(g.chrom, g.initial, g.final), file=f) f.close()
def load_exon_sequence(bed, directory, genome_path): """Load the exon sequence from the the transcripts. Input BED format should contain: blockCount - The number of blocks (exons) in the BED line. blockSizes - A comma-separated list of the block sizes. blockStarts - A comma-separated list of block starts. see details: http://genome.ucsc.edu/FAQ/FAQformat#format1 Output: Each FASTA file represants a transcript and contains all the exons within the file. """ regionset = GenomicRegionSet("bed") regionset.read_bed(bed) regionset.sort() genome = pysam.Fastafile(genome_path) try: if len(regionset.sequences[0].data.split("\t")) == 7: blockinfor = True no_exon = False except: blockinfor = False regionset.sequences.sort(key=lambda g: g.name) no_exon = True if blockinfor: for gr in regionset: if not gr.name: print("Error: For fetching exon sequences, please define the transcript name.") sys.exit() else: if not os.path.exists(directory): os.makedirs(directory) f = open(os.path.join(directory, gr.name+".fa"), "w") data = gr.data.split("\t") #print(len(data)) if len(data) == 7: #print(data) n = int(data[4]) blocks = [ int(b) for b in filter(None, data[5].split(",")) ] starts = [ int(s) for s in filter(None, data[6].split(",")) ] printstr = [] for i in range(n): start = gr.initial + starts[i] end = start + blocks[i] if no_exon and i == 0: ex = "" elif gr.orientation == "-": ex = "exon:"+str(n-i) else: ex = "exon:"+str(i+1) if gr.orientation == "-": seq = Seq(genome.fetch(gr.chrom, start-1, end-1), IUPAC.unambiguous_dna) seq = seq.reverse_complement() p = [ ">"+ " ".join([ gr.name, ex, "_".join(["REGION",gr.chrom, str(start),str(end), gr.orientation]) ]), seq ] printstr.append(p) else: p = [ ">"+ " ".join([gr.name, ex, "_".join(["REGION",gr.chrom,str(start),str(end), gr.orientation]) ]), genome.fetch(gr.chrom, start-1, end-1) ] printstr.append(p) if gr.orientation == "-": printstr = printstr[::-1] for i in range(n): print(printstr[i][0], file=f) print(printstr[i][1], file=f) else: print("Warning: The given regions have no block information, please try write_bed_blocks") f.close() else: pre_id = "" for gr in regionset: if not gr.name: gr.name = gr.toString() if pre_id == "": pre_id = gr.name z = GenomicRegionSet(gr.name) z.add(gr) elif gr.name == pre_id: z.add(gr) else: f = open(os.path.join(directory, pre_id+".fa"), "w") for i, g in enumerate(z): try: regiontag = "_".join(["REGION", g.chrom, str(g.initial),str(g.final), gr.orientation]) except: regiontag = "_".join(["REGION", g.chrom, str(g.initial),str(g.final)] ) print( ">"+ " ".join([g.name, regiontag ]), file=f) print(genome.fetch(g.chrom, g.initial, g.final), file=f) f.close() pre_id = gr.name z = GenomicRegionSet(gr.name) z.add(gr) # Last TX f = open(os.path.join(directory, pre_id+".fa"), "w") for i, g in enumerate(z): try: regiontag = "_".join(["REGION", g.chrom, str(g.initial),str(g.final), gr.orientation]) except: regiontag = "_".join(["REGION", g.chrom, str(g.initial),str(g.final)] ) print( ">"+ " ".join([g.name, regiontag ]), file=f) print(genome.fetch(g.chrom, g.initial, g.final), file=f) f.close()