Exemple #1
0
    def test_region_with_really_long_name(self):
        """Check that an over-long region name is stored truncated.

        A single region whose name is 84 characters long is liquidated; the
        resulting counts record is expected to hold only the first 63
        characters of that name, with start, stop and count unaffected.
        """
        start = 1
        stop = 8
        region_name = 'r' * 84
        truncated_region_name = 'r' * 63
        regions_file_path = create_single_region_gff_file(
            self.dir_path, self.chromosome, start, stop,
            region_name=region_name)

        liquidator = blb.RegionLiquidator(
            regions_file=regions_file_path,
            output_directory=os.path.join(self.dir_path, 'output'),
            bam_file_path=self.bam_file_path)
        liquidator.flatten()

        matrix_path = os.path.join(self.dir_path, 'matrix.gff')
        blb.write_bamToGff_matrix(matrix_path, liquidator.counts_file_path)

        with tables.open_file(liquidator.counts_file_path) as counts:
            # 1 since only a single bam file
            self.assertEqual(1, len(counts.root.files))
            # 1 since only a single read
            self.assertEqual(1, counts.root.files[0]['length'])
            # 1 since only a single region
            self.assertEqual(1, len(counts.root.region_counts))

            record = counts.root.region_counts[0]

            # The stored name may come back as bytes (PyTables string columns
            # do under Python 3); decode it so it can be compared with a str.
            # On Python 2 bytes is str and decoding does not change the
            # outcome of the comparison.
            stored_name = record['region_name']
            if isinstance(stored_name, bytes):
                stored_name = stored_name.decode()

            self.assertEqual(truncated_region_name, stored_name)
            self.assertEqual(start, record['start'])
            self.assertEqual(stop, record['stop'])
            # count represents how many base pair reads intersected
            self.assertEqual(stop - start, record['count'])
Exemple #2
0
    def test_region_liquidation(self):
        """Liquidate one GFF region and verify the counts file and matrix.

        The counts file must describe exactly one bam file, one read and one
        region; the matrix file must hold one header row and one data row.
        """
        region_name = 'region_f'
        start, stop = 1, 8

        gff_path = create_single_region_gff_file(
            self.dir_path, self.chromosome, start, stop,
            region_name=region_name)

        output_dir = os.path.join(self.dir_path, 'output')
        liquidator = blb.RegionLiquidator(
            regions_file=gff_path,
            output_directory=output_dir,
            bam_file_path=self.bam_file_path)
        liquidator.flatten()

        matrix_path = os.path.join(self.dir_path, 'matrix.gff')
        blb.write_bamToGff_matrix(matrix_path, liquidator.counts_file_path)

        with tables.open_file(liquidator.counts_file_path) as counts:
            files_table = counts.root.files
            # 1 since only a single bam file
            self.assertEqual(1, len(files_table))
            # 1 since only a single read
            self.assertEqual(1, files_table[0]['length'])

            region_table = counts.root.region_counts
            # 1 since only a single region
            self.assertEqual(1, len(region_table))

            record = region_table[0]
            stored_name = record['region_name'].decode()
            self.assertEqual(region_name, stored_name)
            self.assertEqual(start, record['start'])
            self.assertEqual(stop, record['stop'])
            # count represents how many base pair reads intersected the region
            expected_count = stop - start
            self.assertEqual(expected_count, record['count'])

            # todo: add normalization record checks

        with open(matrix_path, 'r') as matrix_file:
            matrix_lines = list(matrix_file)
            self.assertEqual(2, len(matrix_lines))
            header_line, data_line = matrix_lines

            header_cols = header_line.split('\t')
            self.assertEqual(3, len(header_cols))
            gene_id_col, locus_col, bin_col = header_cols
            self.assertEqual('GENE_ID', gene_id_col)
            self.assertEqual('locusLine', locus_col)
            self.assertEqual('bin_1_%s\n' % 'single.bam', bin_col)

            data_cols = data_line.split('\t')
            self.assertEqual(3, len(data_cols))
            name_value, locus_value, bin_value = data_cols
            self.assertEqual(region_name, name_value)
            # todo: don't hardcode these values
            self.assertEqual('chr1(.):1-8', locus_value)
            self.assertEqual('1000000.0\n', bin_value)
Exemple #3
0
    def test_region_liquidation(self):
        """Liquidate a single GFF region and check both generated outputs.

        Verifies the counts file (one bam file, one read, one region with the
        expected name/start/stop/count) and the two-line matrix file written
        by ``blb.write_bamToGff_matrix``.
        """
        start = 1
        stop = 8
        region_name = 'region_f'
        regions_file_path = create_single_region_gff_file(
            self.dir_path, self.chromosome, start, stop,
            region_name=region_name)

        liquidator = blb.RegionLiquidator(
            regions_file=regions_file_path,
            output_directory=os.path.join(self.dir_path, 'output'),
            bam_file_path=self.bam_file_path)
        liquidator.flatten()

        matrix_path = os.path.join(self.dir_path, 'matrix.gff')
        blb.write_bamToGff_matrix(matrix_path, liquidator.counts_file_path)

        with tables.open_file(liquidator.counts_file_path) as counts:
            # 1 since only a single bam file
            self.assertEqual(1, len(counts.root.files))
            # 1 since only a single read
            self.assertEqual(1, counts.root.files[0]['length'])
            # 1 since only a single region
            self.assertEqual(1, len(counts.root.region_counts))

            record = counts.root.region_counts[0]

            # The stored name may come back as bytes (PyTables string columns
            # do under Python 3); decode it so it can be compared with a str.
            # On Python 2 bytes is str and decoding does not change the
            # outcome of the comparison.
            stored_name = record['region_name']
            if isinstance(stored_name, bytes):
                stored_name = stored_name.decode()

            self.assertEqual(region_name, stored_name)
            self.assertEqual(start, record['start'])
            self.assertEqual(stop, record['stop'])
            # count represents how many base pair reads intersected
            # the region
            self.assertEqual(stop - start, record['count'])

            # todo: add normalization record checks

        with open(matrix_path, 'r') as matrix_file:
            matrix_lines = matrix_file.readlines()
            self.assertEqual(2, len(matrix_lines))

            header_cols = matrix_lines[0].split('\t')
            self.assertEqual(3, len(header_cols))
            self.assertEqual('GENE_ID', header_cols[0])
            self.assertEqual('locusLine', header_cols[1])
            self.assertEqual('bin_1_%s\n' % 'single.bam', header_cols[2])

            data_cols = matrix_lines[1].split('\t')
            self.assertEqual(3, len(data_cols))
            self.assertEqual(region_name, data_cols[0])
            # todo: don't hardcode these values
            self.assertEqual('chr1(.):1-8', data_cols[1])
            self.assertEqual('1000000.0\n', data_cols[2])
Exemple #4
0
    def test_region_with_really_long_name(self):
        """Liquidate a region with an 84-character name.

        The counts record is expected to carry only the first 63 characters
        of the name, while start, stop and count are unchanged.
        """
        start, stop = 1, 8
        long_name = 'r' * 84
        truncated_region_name = 'r' * 63

        gff_path = create_single_region_gff_file(
            self.dir_path, self.chromosome, start, stop,
            region_name=long_name)

        output_dir = os.path.join(self.dir_path, 'output')
        liquidator = blb.RegionLiquidator(
            regions_file=gff_path,
            output_directory=output_dir,
            bam_file_path=self.bam_file_path)
        liquidator.flatten()

        matrix_path = os.path.join(self.dir_path, 'matrix.gff')
        blb.write_bamToGff_matrix(matrix_path, liquidator.counts_file_path)

        with tables.open_file(liquidator.counts_file_path) as counts:
            files_table = counts.root.files
            # 1 since only a single bam file
            self.assertEqual(1, len(files_table))
            # 1 since only a single read
            self.assertEqual(1, files_table[0]['length'])

            region_table = counts.root.region_counts
            # 1 since only a single region
            self.assertEqual(1, len(region_table))

            record = region_table[0]
            stored_name = record['region_name'].decode()
            self.assertEqual(truncated_region_name, stored_name)
            self.assertEqual(start, record['start'])
            self.assertEqual(stop, record['stop'])
            # count represents how many base pair reads intersected
            expected_count = stop - start
            self.assertEqual(expected_count, record['count'])
Exemple #5
0
    def test_region_liquidation_with_optional_bed_columns(self):
        """Liquidate single-line BED files holding 0 through 8 columns.

        Constructing the liquidator is expected to fail for fewer than three
        columns and to succeed otherwise. The region name is expected to be
        empty unless the fourth (name) column is present.
        """
        start, stop = 1, 8
        region_name = 'region_f'
        score = 9.9
        strand = "."

        columns = [
            "chr1", str(start), str(stop),
            region_name, str(score), strand,
            "thickStart", "thickEnd", "itemRgb",
            "blockCount", "blockSizes", "blockStarts",
        ]

        def create_single_region_bed_file(dir_path, columns):
            # One tab-separated line terminated by a newline.
            bed_path = os.path.join(dir_path, "single.bed")
            line = '\t'.join('%s' % column for column in columns)
            with open(bed_path, 'w') as bed_file:
                bed_file.write(line)
                bed_file.write('\n')
            return bed_path

        for number_of_columns in range(9):
            bed_path = create_single_region_bed_file(
                self.dir_path, columns[:number_of_columns])
            has_name_column = number_of_columns >= 4

            try:
                liquidator = blb.RegionLiquidator(
                    regions_file=bed_path,
                    output_directory=os.path.join(self.dir_path, 'output'),
                    bam_file_path=self.bam_file_path)
            except Exception:
                # Construction may only fail when required columns are missing.
                self.assertLess(number_of_columns, 3)
                continue

            self.assertGreaterEqual(number_of_columns, 3)

            liquidator.flatten()

            matrix_path = os.path.join(self.dir_path, 'matrix.gff')
            blb.write_bamToGff_matrix(matrix_path, liquidator.counts_file_path)

            with tables.open_file(liquidator.counts_file_path) as counts:
                files_table = counts.root.files
                # 1 since only a single bam file
                self.assertEqual(1, len(files_table))
                # 1 since only a single read
                self.assertEqual(1, files_table[0]['length'])

                region_table = counts.root.region_counts
                # 1 since only a single region
                self.assertEqual(1, len(region_table))

                record = region_table[0]
                stored_name = record['region_name'].decode()
                if has_name_column:
                    self.assertEqual(region_name, stored_name)
                else:
                    self.assertEqual("", stored_name)
                self.assertEqual(start, record['start'])
                self.assertEqual(stop, record['stop'])
                # count represents how many base pair reads intersected
                # the region
                expected_count = stop - start
                self.assertEqual(expected_count, record['count'])

            with open(matrix_path, 'r') as matrix_file:
                matrix_lines = list(matrix_file)
                self.assertEqual(2, len(matrix_lines))
                header_line, data_line = matrix_lines

                header_cols = header_line.split('\t')
                self.assertEqual(3, len(header_cols))
                self.assertEqual('GENE_ID', header_cols[0])
                self.assertEqual('locusLine', header_cols[1])
                self.assertEqual('bin_1_%s\n' % 'single.bam', header_cols[2])

                data_cols = data_line.split('\t')
                self.assertEqual(3, len(data_cols))
                if has_name_column:
                    self.assertEqual(region_name, data_cols[0])
                else:
                    self.assertEqual("", data_cols[0])
                # todo: don't hardcode these values
                self.assertEqual('chr1(.):1-8', data_cols[1])
                self.assertEqual('1000000.0\n', data_cols[2])
Exemple #6
0
    def test_region_liquidation_with_optional_bed_columns(self):
        """Liquidate single-line BED files holding 0 through 8 columns.

        Constructing the liquidator is expected to fail for fewer than three
        columns and to succeed otherwise. The region name is expected to be
        empty unless the fourth (name) column is present.
        """
        start = 1
        stop = 8
        region_name = 'region_f'
        score = 9.9
        strand = "."

        columns = ["chr1", str(start), str(stop), region_name, str(score),
                   strand, "thickStart", "thickEnd", "itemRgb", "blockCount",
                   "blockSizes", "blockStarts"]

        def create_single_region_bed_file(dir_path, columns):
            # Write the given columns as one tab-separated, newline-terminated
            # line and return the path of the file that was written.
            region_file_path = os.path.join(dir_path, "single.bed")
            with open(region_file_path, 'w') as region_file:
                region_file.write('\t'.join('%s' % c for c in columns))
                region_file.write('\n')
            return region_file_path

        for number_of_columns in range(0, 9):
            regions_file_path = create_single_region_bed_file(
                self.dir_path, columns[:number_of_columns])

            try:
                liquidator = blb.RegionLiquidator(
                    regions_file=regions_file_path,
                    output_directory=os.path.join(self.dir_path, 'output'),
                    bam_file_path=self.bam_file_path)
            except Exception:
                # Construction may only fail when required columns are missing.
                self.assertLess(number_of_columns, 3)
                continue
            else:
                self.assertGreaterEqual(number_of_columns, 3)

            liquidator.flatten()

            matrix_path = os.path.join(self.dir_path, 'matrix.gff')
            blb.write_bamToGff_matrix(matrix_path, liquidator.counts_file_path)

            # The name column is the fourth one; without it an empty name is
            # expected in both outputs.
            expected_name = region_name if number_of_columns >= 4 else ""

            with tables.open_file(liquidator.counts_file_path) as counts:
                # 1 since only a single bam file
                self.assertEqual(1, len(counts.root.files))
                # 1 since only a single read
                self.assertEqual(1, counts.root.files[0]['length'])
                # 1 since only a single region
                self.assertEqual(1, len(counts.root.region_counts))

                record = counts.root.region_counts[0]

                # The stored name may come back as bytes (PyTables string
                # columns do under Python 3); decode it so it can be compared
                # with a str. On Python 2 bytes is str and decoding does not
                # change the outcome of the comparison.
                stored_name = record['region_name']
                if isinstance(stored_name, bytes):
                    stored_name = stored_name.decode()

                self.assertEqual(expected_name, stored_name)
                self.assertEqual(start, record['start'])
                self.assertEqual(stop, record['stop'])
                # count represents how many base pair reads intersected
                # the region
                self.assertEqual(stop - start, record['count'])

            with open(matrix_path, 'r') as matrix_file:
                matrix_lines = matrix_file.readlines()
                self.assertEqual(2, len(matrix_lines))

                header_cols = matrix_lines[0].split('\t')
                self.assertEqual(3, len(header_cols))
                self.assertEqual('GENE_ID', header_cols[0])
                self.assertEqual('locusLine', header_cols[1])
                self.assertEqual('bin_1_%s\n' % 'single.bam', header_cols[2])

                data_cols = matrix_lines[1].split('\t')
                self.assertEqual(3, len(data_cols))
                self.assertEqual(expected_name, data_cols[0])
                # todo: don't hardcode these values
                self.assertEqual('chr1(.):1-8', data_cols[1])
                self.assertEqual('1000000.0\n', data_cols[2])