def helper_check_region_with_chromosome(self, chromosome):
    """Check raw and normalized counts for one region on ``chromosome``.

    Creates a single-region gff file (start 1, stop 8) and a bam file for
    the given chromosome name using the module-level helpers
    ``create_single_region_gff_file`` and ``create_bam``, constructs a
    ``blb.RegionLiquidator`` over them, and asserts on the contents of the
    resulting counts file.

    Args:
        chromosome: chromosome name used for both the region file and the
            bam file.
    """
    start = 1
    stop = 8
    regions_file_path = create_single_region_gff_file(
        self.dir_path, chromosome, start, stop)
    bam_file_path = create_bam(self.dir_path, [chromosome], self.sequence)
    liquidator = blb.RegionLiquidator(
        regions_file=regions_file_path,
        output_directory=os.path.join(self.dir_path, 'output'),
        bam_file_path=bam_file_path)

    with tables.open_file(liquidator.counts_file_path) as counts:
        # 1 since only a single bam file
        self.assertEqual(1, len(counts.root.files))

        # 1 since only a single read
        file_length = counts.root.files[0]['length']
        self.assertEqual(1, file_length)

        record = counts.root.region_counts[0]
        self.assertEqual(start, record['start'])
        self.assertEqual(stop, record['stop'])
        # count represents how many base pair reads intersected the region
        self.assertEqual(stop - start, record['count'])

        # Expected scaling: 1 / region width, times 1 / (file length in
        # millions).  The file length is read from the table instead of
        # repeating the literal 1; int() keeps the arithmetic in plain
        # Python numbers so the result matches the previous hard-coded form.
        factor = (1 / (stop - start)) * (1 / (int(file_length) / 10**6))
        self.assertEqual(record['count'] * factor,
                         record['normalized_count'])
def test_empty_region_file(self):
    """Run a batch liquidation against a zero-byte region file.

    There are no assertions; the test passes as long as constructing the
    liquidator and calling ``batch`` do not raise.
    """
    empty_file_path = os.path.join(self.dir_path, 'empty.gff')
    # Opening for write and closing immediately leaves an empty file behind.
    with open(empty_file_path, 'w'):
        pass

    output_dir = os.path.join(self.dir_path, 'output')
    liquidator = blb.RegionLiquidator(
        regions_file=empty_file_path,
        output_directory=output_dir,
        bam_file_path=self.bam_file_path)
    liquidator.batch(extension=0, sense='.')
def test_region_liquidation(self):
    """Liquidate one named region and verify the counts file and matrix.

    A single gff region (1-8) named ``region_f`` is liquidated against the
    fixture bam, flattened, and exported with ``write_bamToGff_matrix``;
    both the counts table and the tab-separated matrix file are checked.
    """
    region_name = 'region_f'
    start, stop = 1, 8

    regions_file_path = create_single_region_gff_file(
        self.dir_path, self.chromosome, start, stop,
        region_name=region_name)
    output_dir = os.path.join(self.dir_path, 'output')
    liquidator = blb.RegionLiquidator(
        regions_file=regions_file_path,
        output_directory=output_dir,
        bam_file_path=self.bam_file_path)
    liquidator.flatten()

    matrix_path = os.path.join(self.dir_path, 'matrix.gff')
    blb.write_bamToGff_matrix(matrix_path, liquidator.counts_file_path)

    with tables.open_file(liquidator.counts_file_path) as counts:
        # 1 since only a single bam file
        self.assertEqual(1, len(counts.root.files))
        # 1 since only a single read
        self.assertEqual(1, counts.root.files[0]['length'])
        # 1 since only a single region
        self.assertEqual(1, len(counts.root.region_counts))

        record = counts.root.region_counts[0]
        self.assertEqual(region_name, record['region_name'].decode())
        self.assertEqual(start, record['start'])
        self.assertEqual(stop, record['stop'])
        # count represents how many base pair reads intersected the region
        self.assertEqual(stop - start, record['count'])
        # todo: add normalization record checks

    with open(matrix_path, 'r') as matrix_file:
        matrix_lines = matrix_file.readlines()

    # One header line followed by one data line.
    self.assertEqual(2, len(matrix_lines))
    header_line, data_line = matrix_lines

    header_cols = header_line.split('\t')
    self.assertEqual(3, len(header_cols))
    self.assertEqual('GENE_ID', header_cols[0])
    self.assertEqual('locusLine', header_cols[1])
    self.assertEqual('bin_1_%s\n' % 'single.bam', header_cols[2])

    data_cols = data_line.split('\t')
    self.assertEqual(3, len(data_cols))
    self.assertEqual(region_name, data_cols[0])
    # todo: don't hardcode these values
    self.assertEqual('chr1(.):1-8', data_cols[1])
    self.assertEqual('1000000.0\n', data_cols[2])
def test_region_other_extension(self):
    """Liquidate a gff-formatted region file that has a ``.txt`` name.

    The region format is passed explicitly as ``'gff'``.  There are no
    assertions; the test passes as long as construction does not raise.
    """
    start = 1
    stop = len(self.sequence)
    regions_file_path = create_single_region_gff_file(
        self.dir_path, self.chromosome, start, stop,
        file_name='single.txt')

    output_dir = os.path.join(self.dir_path, 'region_output')
    region_liquidator = blb.RegionLiquidator(
        regions_file=regions_file_path,
        region_format='gff',
        output_directory=output_dir,
        bam_file_path=self.bam_file_path)
def test_region_long_bam_file_name(self):
    """Liquidate against a copy of the fixture bam with a 65-char name.

    The bam and its ``.bai`` index are copied to the long name first.
    There are no assertions; the test passes as long as construction does
    not raise.
    """
    # Original note: "more than Float64Col" -- presumably the name is meant
    # to exceed some stored column width; TODO confirm which one.
    long_file_name = 'x' * 65
    long_file_path = os.path.join(self.dir_path, long_file_name)
    for suffix in ('', '.bai'):
        shutil.copyfile(self.bam_file_path + suffix,
                        long_file_path + suffix)

    start = 1
    stop = len(self.sequence)
    regions_file_path = create_single_region_gff_file(
        self.dir_path, self.chromosome, start, stop)

    output_dir = os.path.join(self.dir_path, 'region_output')
    region_liquidator = blb.RegionLiquidator(
        regions_file=regions_file_path,
        output_directory=output_dir,
        bam_file_path=long_file_path)
def test_region_with_wrong_chromosome(self):
    """Liquidate a region on a chromosome name that differs from the bam's.

    The region uses ``self.chromosome + '0'`` and coordinates past the end
    of the sequence; the counts file is expected to hold no region rows.
    """
    start = len(self.sequence) + 10
    stop = start + 10
    other_chromosome = self.chromosome + '0'
    regions_file_path = create_single_region_gff_file(
        self.dir_path, other_chromosome, start, stop)

    output_dir = os.path.join(self.dir_path, 'output')
    liquidator = blb.RegionLiquidator(
        regions_file=regions_file_path,
        output_directory=output_dir,
        bam_file_path=self.bam_file_path)

    with tables.open_file(liquidator.counts_file_path) as counts:
        # 1 since only a single bam file
        self.assertEqual(1, len(counts.root.files))
        # 1 since only a single read
        self.assertEqual(1, counts.root.files[0]['length'])
        # no valid regions
        self.assertEqual(0, len(counts.root.region_counts))
def test_region_with_really_long_name(self):
    """Liquidate a region whose name is 84 characters long.

    The stored ``region_name`` is expected to be cut down to 63
    characters.  The matrix file is written but its contents are not
    inspected here.
    """
    start, stop = 1, 8
    region_name = 'r' * 84
    truncated_region_name = 'r' * 63

    regions_file_path = create_single_region_gff_file(
        self.dir_path, self.chromosome, start, stop,
        region_name=region_name)
    output_dir = os.path.join(self.dir_path, 'output')
    liquidator = blb.RegionLiquidator(
        regions_file=regions_file_path,
        output_directory=output_dir,
        bam_file_path=self.bam_file_path)
    liquidator.flatten()

    matrix_path = os.path.join(self.dir_path, 'matrix.gff')
    blb.write_bamToGff_matrix(matrix_path, liquidator.counts_file_path)

    with tables.open_file(liquidator.counts_file_path) as counts:
        # 1 since only a single bam file
        self.assertEqual(1, len(counts.root.files))
        # 1 since only a single read
        self.assertEqual(1, counts.root.files[0]['length'])
        # 1 since only a single region
        self.assertEqual(1, len(counts.root.region_counts))

        record = counts.root.region_counts[0]
        self.assertEqual(truncated_region_name,
                         record['region_name'].decode())
        self.assertEqual(start, record['start'])
        self.assertEqual(stop, record['stop'])
        # count represents how many base pair reads intersected
        self.assertEqual(stop - start, record['count'])
def test_region_liquidation_with_optional_bed_columns(self):
    """Liquidate bed files carrying 0 through 8 leading columns.

    For each column count a one-line bed file is written from a prefix of
    the full column list.  Construction is expected to raise when fewer
    than three columns are present and to succeed otherwise; on success
    the counts table and matrix file are verified.  The region name is
    expected to be empty unless the fourth (name) column was written.
    """
    start = 1
    stop = 8
    region_name = 'region_f'
    score = 9.9
    strand = "."
    columns = [
        "chr1", str(start), str(stop), region_name, str(score), strand,
        "thickStart", "thickEnd", "itemRgb", "blockCount", "blockSizes",
        "blockStarts"
    ]

    def create_single_region_bed_file(dir_path, columns):
        # Write the columns as one tab-separated line; return the path.
        region_file_path = os.path.join(dir_path, "single.bed")
        line = '\t'.join('%s' % column for column in columns) + '\n'
        with open(region_file_path, 'w') as region_file:
            region_file.write(line)
        return region_file_path

    for number_of_columns in range(9):
        regions_file_path = create_single_region_bed_file(
            self.dir_path, columns[:number_of_columns])
        output_dir = os.path.join(self.dir_path, 'output')

        try:
            liquidator = blb.RegionLiquidator(
                regions_file=regions_file_path,
                output_directory=output_dir,
                bam_file_path=self.bam_file_path)
        except Exception:
            # A failure is only acceptable when fewer than three columns
            # were written.
            self.assertLess(number_of_columns, 3)
            continue

        # Reaching this point means construction succeeded.
        self.assertGreaterEqual(number_of_columns, 3)
        expected_name = region_name if number_of_columns >= 4 else ""

        liquidator.flatten()
        matrix_path = os.path.join(self.dir_path, 'matrix.gff')
        blb.write_bamToGff_matrix(matrix_path, liquidator.counts_file_path)

        with tables.open_file(liquidator.counts_file_path) as counts:
            # 1 since only a single bam file
            self.assertEqual(1, len(counts.root.files))
            # 1 since only a single read
            self.assertEqual(1, counts.root.files[0]['length'])
            # 1 since only a single region
            self.assertEqual(1, len(counts.root.region_counts))

            record = counts.root.region_counts[0]
            self.assertEqual(expected_name, record['region_name'].decode())
            self.assertEqual(start, record['start'])
            self.assertEqual(stop, record['stop'])
            # count represents how many base pair reads intersected the
            # region
            self.assertEqual(stop - start, record['count'])

        with open(matrix_path, 'r') as matrix_file:
            matrix_lines = matrix_file.readlines()

        # One header line followed by one data line.
        self.assertEqual(2, len(matrix_lines))
        header_line, data_line = matrix_lines

        header_cols = header_line.split('\t')
        self.assertEqual(3, len(header_cols))
        self.assertEqual('GENE_ID', header_cols[0])
        self.assertEqual('locusLine', header_cols[1])
        self.assertEqual('bin_1_%s\n' % 'single.bam', header_cols[2])

        data_cols = data_line.split('\t')
        self.assertEqual(3, len(data_cols))
        self.assertEqual(expected_name, data_cols[0])
        # todo: don't hardcode these values
        self.assertEqual('chr1(.):1-8', data_cols[1])
        self.assertEqual('1000000.0\n', data_cols[2])