Beispiel #1
0
    def test_save_fasta(self):
        """Check saveFASTA() headers and sequences, with and without a name list."""
        out_path = "/tmp/test_fasta.fa"
        seq_chr1 = 'ATCAGACAGGTAGATCATCTCGCTCCGAGCTTGCCACCAGCAAACCATTGC'
        seq_chra = 'GTAAAAACCCGATGGAATACTCATCCAGTAAGTCCGAACCACTTCAACATC'

        genome_mm10 = glbase3.genome()
        genome_mm10.bindSequence("test_data/seq")

        entries = [
            {"name": "A", "loc": glbase3.location(loc="chr1:100-150")},
            {"name": "X", "loc": glbase3.location(loc="chrA:100-150")},
        ]
        regions = glbase3.genelist()
        regions.load_list(entries)
        fasta = genome_mm10.getSequences(regions)

        # With name=["loc", "name"] each header is expected to read "<loc>_<name>".
        fasta.saveFASTA(filename=out_path, name=["loc", "name"])
        with open(out_path) as handle:
            for expected_line in ('>chr1:100-150_A', seq_chr1,
                                  '>chrA:100-150_X', seq_chra):
                self.assertEqual(handle.readline().strip(), expected_line)

        # Without a name argument the header is expected to be the location only.
        fasta.saveFASTA(filename=out_path)
        with open(out_path) as handle:
            for expected_line in ('>chr1:100-150', seq_chr1,
                                  '>chrA:100-150', seq_chra):
                self.assertEqual(handle.readline().strip(), expected_line)
Beispiel #2
0
 def setUp(self):
     """Create a new genome_sql database and add one 'Nanog' feature to it."""
     # NOTE: this hard-coded path is platform specific and breaks on Windows.
     db_path = '/tmp/test_genome_sql.sql'
     self.gsql = glbase3.genome_sql(new=True, filename=db_path)

     # Two separate (but equal) location objects are passed, as before.
     first_loc = glbase3.location(chr='chr1', left=110, right=120)
     second_loc = glbase3.location(chr='chr1', left=110, right=120)
     self.gsql.add_feature(first_loc, second_loc, 10,
                           [1, 2, 3, 4], [5, 6, 7, 8], 'Nanog', '+')
Beispiel #3
0
 def test_mask(self):
     """Check flat_track.get() on chr2:99-111 with and without mask_zero=True."""
     region = "chr2:99-111"
     track = glbase3.flat_track(filename="/tmp/test.flat", bin_format="f")

     # Unmasked read: compare element-wise as ints against the expected values.
     plain = track.get(glbase3.location(loc=region))
     unmasked = [2] + [0] * 11 + [2]
     matches = [int(got) == int(want) for got, want in zip(plain, unmasked)]
     self.assertTrue(all(matches))

     # Masked read: compared through its string form, for want of a better check.
     masked = track.get(glbase3.location(loc=region), mask_zero=True)
     expected = "[2.0 -- -- -- -- -- -- -- -- -- -- --]"
     self.assertEqual(str(masked), expected)
Beispiel #4
0
 def test_read_extend(self):
     """Check track.get() over chr1:10-20 for read_extend=1 and read_extend=2."""
     # get() always returns floats now, hence the float literals.
     cases = (
         (1, [5., 5., 5., 4., 4., 5., 5., 4., 4., 5., 5.]),
         (2, [5., 5., 5., 5., 4., 5., 5., 5., 4., 5., 5.]),
     )
     for extend, expected in cases:
         observed = self.t.get(gl.location(loc="chr1:10-20"),
                               read_extend=extend)
         self.assertListEqual(list(observed), expected)
Beispiel #5
0
    def test_remove_dupes_by_loc(self):
        """removeDuplicatesByLoc(mode='pointify_expand', delta=100) keeps 4 of 9 entries."""
        spans = [
            "chr1:1000-1200",
            "chr1:1000-1200",
            "chr1:1100-1200",
            "chr1:1300-1400",
            "chr1:1300-1400",
            "chr1:1300-1400",
            "chr1:1600-1600",
            "chr1:1423-1423",
            "chr2:1000-1200",
        ]
        entries = [{"loc": glbase3.location(loc=span)} for span in spans]

        g = glbase3.genelist()
        g.load_list(entries)

        deduped = g.removeDuplicatesByLoc(delta=100, mode='pointify_expand')

        self.assertEqual(len(deduped), 4)
Beispiel #6
0
def generate_track_frags_only(filename, norm_factor):
    """Create a new track, add four overlapping chr1 fragments, finalise and return it.

    filename and norm_factor are passed straight through to gl.track().
    """
    track = gl.track(filename=filename,
                     new=True,
                     name="Test Track",
                     norm_factor=norm_factor)

    # Fragments share most of their span so per-base counts overlap.
    fragment_spans = (
        "chr1:10-20",
        "chr1:10-21",
        "chr1:10-22",
        "chr1:9-23",
    )
    for span in fragment_spans:
        track.add_location(gl.location(loc=span))

    track.finalise()
    return track
Beispiel #7
0
    def test_buckets(self):
        """Check how a genelist distributes locations over its buckets.

        The global glbase3.config.bucket_size is temporarily lowered to 100.
        It is restored to its previous value in a ``finally`` clause, so a
        failing assertion cannot leak the test-only setting into other tests.
        """
        previous_bucket_size = glbase3.config.bucket_size
        glbase3.config.bucket_size = 100  # smaller value for testing purposes
        try:
            bucket_size = glbase3.config.bucket_size

            spans = [
                "chr1:1000-1200",
                "chr1:1200-1300",
                "chr1:1200-1201",
                "chr1:1300-1400",
                "chr1:1400-1500",
                "chr1:1500-1600",
                "chr1:1600-1600",  # point loc on the edge of a bucket
                "chr1:1423-1423",  # point loc in the middle of a bucket
                "chr1:0-1500",  # span much larger than a bucket
            ]

            g = glbase3.genelist()
            g.load_list([{"loc": glbase3.location(loc=span)} for span in spans])

            # Round the query edges down to bucket boundaries; the range is
            # extended by one bucket so right-spanning sites are included too.
            left_buck = ((1299 - 1) // bucket_size) * bucket_size
            right_buck = (1788 // bucket_size) * bucket_size
            buckets_reqd = range(left_buck, right_buck + bucket_size,
                                 bucket_size)

            loc_ids = set()
            for buck in buckets_reqd:
                if buck in g.buckets["1"]:
                    loc_ids.update(g.buckets["1"][buck])  # unique ids

            self.assertSetEqual(loc_ids, set(range(9)))
            self.assertEqual(len(g.buckets), 1)
            self.assertEqual(len(g.buckets["1"]), 17)
        finally:
            glbase3.config.bucket_size = previous_bucket_size  # change it back
Beispiel #8
0
 def test_reload(self):
     """Re-open /tmp/test.flat and check that chr1:0-100 reads back as expected.

     The values are read from the freshly opened track rather than from
     self.t; querying self.t would leave the re-opened track unexercised.
     Assumes the fixture has already written /tmp/test.flat - TODO confirm.
     """
     reloaded = glbase3.flat_track(filename="/tmp/test.flat", bin_format="f")
     a = reloaded.get(glbase3.location(loc="chr1:0-100"))
     expected_result = list(range(100)) + [0]
     # Values are compared as ints, position by position.
     self.assertTrue(
         all(int(x) == int(y) for x, y in zip(a, expected_result)))
Beispiel #9
0
 def test_read_extend_frags_only(self):
     """Check self.frags.get() over chr1:5-25 with read_extend=1."""
     expected = [
         0, 0, 0, 0, 1,
         4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
         3, 2, 1, 0,
     ]
     observed = self.frags.get(gl.location(loc="chr1:5-25"), read_extend=1)
     matches = [got == want for got, want in zip(observed, expected)]
     self.assertTrue(all(matches))
Beispiel #10
0
    def test_pileup(self):
        """Check track.pileup() for six identical reads against the test BED genelist."""
        t = gl.track(filename="/tmp/test_pileup.trk",
                     new=True,
                     name="Test Track")
        # Six identical reads covering chr1:10-15.
        for _ in range(6):
            t.add_location(gl.location(chr="chr1", left=10, right=10 + 5))
        t.finalise()

        g = gl.genelist(filename="test_data/track_test.bed",
                        format=gl.format.bed)
        result = t.pileup(genelist=g,
                          filename="test_images/test_output.png",
                          heatmap_filename="test_images/test_heatmap.png",
                          window_size=15,
                          bin_size=1,
                          respect_strand=True,
                          normalise=False,
                          read_extend=1,
                          raw_tag_filename="test_images/test_tags.tsv")

        # units are now reads per item in genelist
        expected_result = numpy.array(
            [0.] * 10 + [3.] * 3 + [6.] * 4 + [3.] * 3 + [0.] * 10)

        matches = [
            got == want
            for got, want in zip(result["pileup"], expected_result)
        ]
        self.assertTrue(all(matches))
Beispiel #11
0
    def test_norm_factor(self):
        """Check that a track's norm_factor scales the values returned by get()."""
        region = "chr1:10-20"

        # norm_factor=1.0 is expected to give the same values as self.t.
        unit_track = generate_track(filename="/tmp/test_1.0.trk",
                                    norm_factor=1.0)
        baseline = self.t.get(gl.location(loc=region), read_extend=1)
        observed = unit_track.get(gl.location(loc=region), read_extend=1)
        self.assertTrue(
            all([want == got for want, got in zip(baseline, observed)]))

        # Other factors are expected to multiply the baseline element-wise.
        scaled_cases = (
            ("/tmp/test_2.0.trk", 2.0),
            ("/tmp/test_0.5.trk", 0.5),
        )
        for path, factor in scaled_cases:
            scaled_track = generate_track(filename=path, norm_factor=factor)
            observed = scaled_track.get(gl.location(loc=region),
                                        read_extend=1)
            self.assertTrue(
                all([
                    want == got
                    for want, got in zip(baseline * factor, observed)
                ]))
Beispiel #12
0
 def test_reload(self):
     """Re-open /tmp/test.trk and check the reads stored for chr1:1-100.

     The reads are fetched from the freshly opened track rather than from
     self.t; querying self.t would leave the re-opened track unexercised.
     Assumes /tmp/test.trk is the file written for self.t - TODO confirm.
     """
     reloaded = gl.track(filename="/tmp/test.trk")
     # (left, right, strand) of every read expected in the region
     expected_reads = [(10, 20, '+'), (10, 20, '+'), (21, 22, '+'),
                       (23, 23, '+'), (1, 100, '+'), (9, 19, '+'),
                       (15, 15, '+'), (19, 25, '+'), (5, 11, '+')]
     reads = reloaded.get_reads(gl.location(loc="chr1:1-100"))
     # Every returned read must be expected, and the counts must agree.
     self.assertTrue(
         all((r[li], r[ri], r[st]) in expected_reads for r in reads))
     self.assertEqual(len(expected_reads), len(reads))
Beispiel #13
0
    def test_get_reads(self):
        """Check get_reads() over several regions, edges and strands."""

        def check_region(region, expected_reads):
            # Every returned read must be among the expected ones and the
            # counts must agree. Can't test chrom, track1 does not return it.
            reads = self.t.get_reads(gl.location(loc=region))
            found = [(r[li], r[ri], r[st]) in expected_reads for r in reads]
            self.assertTrue(all(found))
            self.assertEqual(len(expected_reads), len(reads))

        # Test a range of reads and edges.
        check_region("chr1:1-100",
                     [(10, 20, '+'), (10, 20, '+'), (21, 22, '+'),
                      (23, 23, '+'), (1, 100, '+'), (9, 19, '+'),
                      (15, 15, '+'), (19, 25, '+'), (5, 11, '+')])

        # Test an inner get for a long read
        check_region("chrX:10-20", [(1, 1000, '+')])

        # Minus-strand read
        check_region("chr3:1-5", [(1, 2, '-')])

        # empty region test
        reads = self.t.get_reads(gl.location(loc="chr3:50-100"))
        self.assertEqual(len(reads), 0)

        # overspanning read test
        check_region("chr1:10-20",
                     [(10, 20, '+'), (10, 20, '+'), (1, 100, '+'),
                      (9, 19, '+'), (5, 11, '+'), (19, 25, '+'),
                      (15, 15, '+')])

        # single point location read test
        check_region("chr1:15-15",
                     [(15, 15, '+'), (10, 20, '+'), (10, 20, '+'),
                      (1, 100, '+'), (9, 19, '+')])
Beispiel #14
0
 def test_get_array(self):
     """Check the values returned by self.t.get() over chr1:10-20."""
     expected = [5., 5., 4., 4., 4., 5., 4., 4., 4., 5., 4.]
     observed = self.t.get(gl.location(loc="chr1:10-20"))
     self.assertListEqual(list(observed), expected)
Beispiel #15
0
 def test_removeDuplicatesByLoc_delete_any_matches(self):
     """With delete_any_matches=True only 2 of the 12 entries should remain."""
     # (left, right, copies) on chr 1; each copy gets its own location object.
     spans = (
         (100, 200, 8),
         (130, 230, 2),
         (9800, 9990, 1),  # across bucket
         (10001, 10200, 1),
     )
     entries = [
         {'loc': glbase3.location(chr=1, left=left, right=right)}
         for left, right, copies in spans
         for _ in range(copies)
     ]

     glist = glbase3.genelist()
     glist.load_list(entries)

     dups = glist.removeDuplicatesByLoc('pointify_expand',
                                        'loc',
                                        10,
                                        delete_any_matches=True)
     self.assertEqual(len(dups), 2)

     dups = glist.removeDuplicatesByLoc('overlap',
                                        'loc',
                                        0,
                                        delete_any_matches=True)
     self.assertEqual(len(dups), 2)
Beispiel #16
0
def generate_track(filename, norm_factor):
    """Create a new track, fill it with the standard test reads, finalise and return it.

    filename and norm_factor are passed straight through to gl.track().
    """
    track = gl.track(filename=filename,
                     new=True,
                     name="Test Track",
                     norm_factor=norm_factor)

    # Reads added without an explicit strand argument.
    unstranded_spans = (
        "chr1:10-20",
        "chr1:10-20",  # test duplicates
        "chr1:21-22",  # test 1's
        "chr1:23-23",  # test single outside
        "chr1:1-100",  # test massive span
        "chr1:9-19",  # inside test
        "chr1:15-15",  # 1bp inside
        "chr1:19-25",  # over right border
        "chr1:5-11",  # over left border
        "chrX:1-1000",  # letter chromosome
        "chr2:2-2000",  # other numeric chr
    )
    for span in unstranded_spans:
        track.add_location(gl.location(loc=span))

    # Reads added with strand="-" to test strand handling.
    for span in ("chr3:1-2", "chr10:1-2", "chr100:1-2"):
        track.add_location(gl.location(loc=span), strand="-")

    track.finalise()
    return track
Beispiel #17
0
    def load_bed(self, filename, out_filename, expand_bed=0):
        '''
        **Purpose**
            Load in a BED file, ideally output by collect_valid_pairs.py,
            although I guess any valid BED will do, and annotate every
            interval with the genome features it overlaps.

            This function is not officially part of te_hic, but could be useful to annotate
            (for example) a BED list of peaks from a ChIP-seq

        **Arguments**
            filename (Required)
                filename of the BED file

            out_filename (Required)
                filename of the tab-separated file to write. Each output line
                holds the first three BED columns followed by the overlapping
                feature names and feature types (comma-separated and sorted,
                or 'None' when nothing overlaps).

            expand_bed (Optional, default=0)
                Optionally expand the BED coordinates left and right by expand_bed

        **Returns**
            None. Results are written to out_filename.
        '''
        assert filename, 'You must specify a filename'

        done = 0
        bucket_size = glbase3.config.bucket_size
        genome_buckets = self.genome.buckets
        linear_data = self.genome.linearData

        output = []

        with open(filename, 'r') as oh:
            for raw_line in oh:
                stripped = raw_line.strip()
                if not stripped:
                    continue  # tolerate blank lines (e.g. a trailing newline)
                line = stripped.split('\t')

                # reach into the genelist guts...
                # work out which of the buckets is required:
                loc = glbase3.location(chr=line[0],
                                       left=int(line[1]) - expand_bed,
                                       right=int(line[2]) + expand_bed)
                left_buck = ((loc["left"] - 1) // bucket_size) * bucket_size
                right_buck = (loc["right"] // bucket_size) * bucket_size

                # A chromosome absent from the genome index simply has no
                # overlapping features; it must not abort the whole run.
                if loc["chr"] in genome_buckets:
                    chrom_buckets = genome_buckets[loc["chr"]]
                else:
                    chrom_buckets = {}

                # get the ids reqd.
                loc_ids = set()
                for buck in range(left_buck, right_buck + bucket_size, bucket_size):
                    if buck in chrom_buckets:
                        loc_ids.update(chrom_buckets[buck])  # set = unique ids

                # Buckets only narrow the candidates; confirm real overlaps.
                result = [
                    linear_data[index] for index in loc_ids
                    if loc.qcollide(linear_data[index]["loc"])
                ]

                # Recomputed for every line, so nothing can carry over from
                # the previous interval.
                if result:
                    # sorted() keeps the output stable between runs
                    read1_feat = ', '.join(sorted({r['name'] for r in result}))
                    read1_type = ', '.join(sorted({r['type'] for r in result}))
                else:
                    read1_feat = 'None'
                    read1_type = 'None'

                output.append('\t'.join(line[0:3] + [read1_feat, read1_type]))

                done += 1

                if done % 1000000 == 0:
                    print('Processed: {:,}'.format(done))

        print('Processed {:,} reads'.format(len(output)))

        with open(out_filename, 'w') as out:
            for o in output:
                out.write('%s\n' % o)
Beispiel #18
0
 def test_get(self):
     """Check the values returned by self.t.get() over chr1:10-20."""
     expected = [10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0]
     observed = self.t.get(glbase3.location(loc="chr1:10-20"))
     self.assertListEqual(list(observed), expected)