Exemple #1
0
 def setUp(self):
     """Load three CSV gene lists and collect them into one glglob.

     testA.csv is read with loc, name and score columns (skiplines set to
     0); testB.csv and testC.csv are read with loc and name columns only.
     """
     format_a = {'loc': 0, 'name': 1, 'score': 2, 'skiplines': 0}
     self.data1 = glbase3.genelist(filename="test_data/testA.csv",
                                   format=format_a)

     # Each list gets its own format dict; nothing is shared between loads.
     format_b = {'loc': 0, 'name': 1}
     self.data2 = glbase3.genelist(filename="test_data/testB.csv",
                                   format=format_b)
     format_c = {'loc': 0, 'name': 1}
     self.data3 = glbase3.genelist(filename="test_data/testC.csv",
                                   format=format_c)

     print(self.data1)

     members = (self.data1, self.data2, self.data3)
     self.g = glbase3.glglob(*members, type="peaklist")
Exemple #2
0
    def setUp(self):
        """Load the file-backed lists (a-e) and build three in-memory lists (f1-f3)."""
        sniffer = glbase3.format.sniffer
        self.a = glbase3.genelist(filename="test_data/testA.csv", format=sniffer)
        self.b = glbase3.genelist(filename="test_data/testB.csv", format=sniffer)
        self.c = glbase3.genelist(filename="test_data/testC.csv", format=sniffer)
        self.d = glbase3.genelist(filename="test_data/ccat_list.region",
                                  format=glbase3.format.ccat_output)
        self.e = glbase3.genelist(filename="test_data/macs_list.xls",
                                  format=glbase3.format.macs_output)

        # f1: five entries that only carry a name.
        fake1 = [
            {"name": gene}
            for gene in ("gene1", "gene2", "gene3", "gene4", "gene5")
        ]
        self.f1 = glbase3.genelist()
        self.f1.load_list(fake1)

        # f2: the first entry has an extra key the others lack.
        fake2 = [{"name": "gene4", "alt_key": "meh"}]
        fake2.extend(
            {"name": gene}
            for gene in ("gene5", "gene6", "gene7", "gene8", "gene9"))
        self.f2 = glbase3.genelist()
        self.f2.load_list(fake2)

        # f3: a repeated name (gene4) and a mix of entries with/without alt_key.
        fake3 = [
            {"name": "gene4", "alt_key": False},
            {"name": "gene4", "alt_key": True},
            {"name": "gene5", "alt_key": False},
            {"name": "gene6"},
            {"name": "gene7"},
        ]
        self.f3 = glbase3.genelist()
        self.f3.load_list(fake3)
Exemple #3
0
    def test_save_fasta(self):
        """saveFASTA() writes the expected headers and sequences, with and without `name`."""
        out_path = "/tmp/test_fasta.fa"
        seq_chr1 = 'ATCAGACAGGTAGATCATCTCGCTCCGAGCTTGCCACCAGCAAACCATTGC'
        seq_chrA = 'GTAAAAACCCGATGGAATACTCATCCAGTAAGTCCGAACCACTTCAACATC'

        genome_mm10 = glbase3.genome()
        genome_mm10.bindSequence("test_data/seq")

        regions = [
            {"name": "A", "loc": glbase3.location(loc="chr1:100-150")},
            {"name": "X", "loc": glbase3.location(loc="chrA:100-150")},
        ]
        newgl = glbase3.genelist()
        newgl.load_list(regions)
        fasta = genome_mm10.getSequences(newgl)

        # First pass: headers are built from the "loc" and "name" keys.
        fasta.saveFASTA(filename=out_path, name=["loc", "name"])
        with open(out_path) as handle:
            for expected in ('>chr1:100-150_A', seq_chr1,
                             '>chrA:100-150_X', seq_chrA):
                self.assertEqual(handle.readline().strip(), expected)

        # Second pass: no `name` argument, the headers carry the location only.
        fasta.saveFASTA(filename=out_path)
        with open(out_path) as handle:
            for expected in ('>chr1:100-150', seq_chr1,
                             '>chrA:100-150', seq_chrA):
                self.assertEqual(handle.readline().strip(), expected)
Exemple #4
0
 def setUp(self):
     """Open array_data.csv as a delayedlist and build a two-gene genelist."""
     # `format` is resolved from the enclosing scope; the original author
     # notes that this delayedlist is not actually needed.
     self.a = glbase3.delayedlist(filename="test_data/array_data.csv",
                                  format=format)

     self.b = glbase3.genelist()
     self.b.load_list([{"name": gene} for gene in ("Lypla1", "Pdia4")])
Exemple #5
0
    def test_remove_dupes_by_loc(self):
        """removeDuplicatesByLoc(delta=100, mode='pointify_expand') leaves four entries."""
        spans = (
            "chr1:1000-1200",
            "chr1:1000-1200",
            "chr1:1100-1200",
            "chr1:1300-1400",
            "chr1:1300-1400",
            "chr1:1300-1400",
            "chr1:1600-1600",
            "chr1:1423-1423",
            "chr2:1000-1200",
        )
        entries = [{"loc": glbase3.location(loc=span)} for span in spans]

        g = glbase3.genelist()
        g.load_list(entries)

        deduplicated = g.removeDuplicatesByLoc(delta=100,
                                               mode='pointify_expand')
        self.assertEqual(len(deduplicated), 4)
Exemple #6
0
    def test_sam_tophat_xs(self):
        """The sam_tophat_xs format yields loc, seq and strand for genelist and delayedlist."""
        sam_path = "test_data/test.sam"
        first_loc = "chr1:3035081-3035081"
        first_seq = "AAACATTCCTGGGAACATCTTGACCATAAGATAAAGGGGACTGTGAAGACATAGCAGGGCTATATTATCTAAGTCAACACCATCTGGCCG"

        reads = gl.genelist(sam_path, format=gl.format.sam_tophat_xs)
        self.assertEqual(reads[0]["loc"], first_loc)
        self.assertEqual(reads[0]["seq"], first_seq)
        self.assertEqual(reads[0]["strand"], "+")
        self.assertEqual(reads[1]["strand"], "-")

        # Same checks through a delayedlist, which is where the original
        # author says the format is usually used.
        delayed = gl.delayedlist(sam_path, format=gl.format.sam_tophat_xs)
        for position, read in enumerate(delayed):
            # Only the first two reads are checked, but the whole list is
            # still iterated.
            if position > 1:
                continue
            if position == 1:
                self.assertEqual(read["strand"], "-")
                continue
            self.assertEqual(read["loc"], first_loc)
            self.assertEqual(read["seq"], first_seq)
            self.assertEqual(read["strand"], "+")
Exemple #7
0
    def test_pileup(self):
        """Pile up six identical reads over the BED regions and compare with the expected profile."""
        trk = gl.track(filename="/tmp/test_pileup.trk",
                       new=True,
                       name="Test Track")
        # Six copies of the same chr1:10-15 location.
        for _ in range(6):
            trk.add_location(gl.location(chr="chr1", left=10, right=10 + 5))
        trk.finalise()

        regions = gl.genelist(filename="test_data/track_test.bed",
                              format=gl.format.bed)
        pileup_kwargs = dict(
            genelist=regions,
            filename="test_images/test_output.png",
            heatmap_filename="test_images/test_heatmap.png",
            window_size=15,
            bin_size=1,
            respect_strand=True,
            normalise=False,
            read_extend=1,
            raw_tag_filename="test_images/test_tags.tsv",
        )
        result = trk.pileup(**pileup_kwargs)

        # 30 values: ten zeros, a 3/6/3 plateau (3, 4 and 3 values wide),
        # then ten zeros.  The original author notes that the units are
        # reads per item in the genelist.
        expected_result = numpy.array(
            [0.] * 10 + [3.] * 3 + [6.] * 4 + [3.] * 3 + [0.] * 10)

        matches = [
            got == want
            for got, want in zip(result["pileup"], expected_result)
        ]
        self.assertTrue(False not in matches)
Exemple #8
0
 def test_renameKey(self):
     """Check that renameKey() renames the key in the returned list only.

     The list stored in self.a must still expose "name", while the list
     returned by renameKey() must expose "other-name" instead.
     """
     self.a = glbase3.genelist(filename="test_data/array_data.csv",
                               format=glbase3.format.sniffer)
     newl = self.a.renameKey("name", "other-name")

     # assertIn/assertNotIn report the member and the container on failure,
     # whereas assertTrue(x in y) only reports "False is not true".
     self.assertIn("name", self.a[0])
     self.assertNotIn("other-name", self.a[0])
     self.assertNotIn("name", newl[0])
     self.assertIn("other-name", newl[0])
Exemple #9
0
    def test_buckets(self):
        """Check that load_list() files every location into the expected buckets.

        glbase3.config.bucket_size is lowered to 100 for the duration of the
        test.  The previous value is restored in a ``finally`` clause so that
        a failing assertion cannot leak the small bucket size into other
        tests.
        """
        previous_bucket_size = glbase3.config.bucket_size
        glbase3.config.bucket_size = 100  # smaller value for testing purposes
        try:
            g = glbase3.genelist()

            data = [
                {"loc": glbase3.location(loc="chr1:1000-1200")},
                {"loc": glbase3.location(loc="chr1:1200-1300")},
                {"loc": glbase3.location(loc="chr1:1200-1201")},
                {"loc": glbase3.location(loc="chr1:1300-1400")},
                {"loc": glbase3.location(loc="chr1:1400-1500")},
                {"loc": glbase3.location(loc="chr1:1500-1600")},
                # point loc on the edge of a bucket
                {"loc": glbase3.location(loc="chr1:1600-1600")},
                # point loc in the middle of a bucket
                {"loc": glbase3.location(loc="chr1:1423-1423")},
                # span much larger than a bucket
                {"loc": glbase3.location(loc="chr1:0-1500")},
            ]
            g.load_list(data)

            bucket_size = glbase3.config.bucket_size
            left_buck = int((1299 - 1) / bucket_size) * bucket_size
            right_buck = int(1788 / bucket_size) * bucket_size
            # With a bucket size of 100 this walks buckets 1200..1700, so
            # both the left-spanning and right-spanning sites are picked up.
            buckets_reqd = range(left_buck, right_buck + bucket_size,
                                 bucket_size)

            loc_ids = set()
            for buck in buckets_reqd:
                if buck in g.buckets["1"]:
                    loc_ids.update(g.buckets["1"][buck])  # unique ids

            self.assertSetEqual(loc_ids, set(range(9)))
            self.assertEqual(len(g.buckets), 1)
            self.assertEqual(len(g.buckets["1"]), 17)
        finally:
            # Always put the global setting back, even when the test fails.
            glbase3.config.bucket_size = previous_bucket_size
Exemple #10
0
 def test_pileup_no_respect_strand(self):
     """With respect_strand=False the 'track_test' pileup equals 0, 1, ..., 29."""
     track = glbase3.flat_track(filename="/tmp/test.flat", bin_format="f")

     bed = glbase3.genelist(filename="test_data/track_test.bed",
                            format=glbase3.format.bed)
     windows = bed.pointify().expand('loc', 15)

     pileups, _ = track.pileup(
         genelists=windows,
         filename="test_images/test_output_no_strand.png",
         bandwidth=15,
         respect_strand=False)

     expected_result = numpy.array(list(range(0, 30)))
     observed = list(pileups['track_test'])
     self.assertListEqual(observed, list(expected_result))
Exemple #11
0
 def test_tsv_sniffer_force(self):
     """The sniffer format with force_tsv=True parses tss_loc for genelist and genome."""
     tsv_path = "test_data/mm9_refGene.tsv"
     expected_tss = "chr1:134212701-134212701"

     # Original author's notes: the input is a tsv file, the sniffer loads
     # locations correctly, and microarrays/delayedlists can't be sniffed.
     a = gl.genelist(filename=tsv_path, force_tsv=True,
                     format=gl.format.sniffer)
     d = gl.genome(filename=tsv_path, force_tsv=True,
                   format=gl.format.sniffer)

     # Make sure the whole row was not lumped into a single key.
     for loaded in (a, d):
         self.assertEqual(expected_tss, loaded[0]["tss_loc"])
Exemple #12
0
    def test_genelist_from_pandas(self):
        """from_pandas() keeps the row count and the values of rows 0, 1 and 3."""
        glb = gl.genelist()
        glb.from_pandas(self.df)

        self.assertEqual(len(glb), self.df.shape[0])

        # Compare as sets, so the order of the values within a row is ignored.
        for row in (0, 1, 3):
            from_list = set(glb[row].values())
            from_frame = set(self.df.loc[[row]].values[0])
            self.assertSetEqual(from_list, from_frame)
Exemple #13
0
    def test_gzipped_delayedlist(self):
        """A gzipped delayedlist matches the plain genelist's length and starts with Lypla1."""
        self.b = glbase3.genelist(filename="test_data/array_data.csv",
                                  format=format)
        self.a = glbase3.delayedlist(filename="test_data/array_data.csv.gz",
                                     format=format,
                                     gzip=True)
        self.assertEqual(len(self.a), len(self.b))

        # Only the first entry is inspected; if the list yields nothing the
        # per-item checks are skipped.
        nothing = object()
        first = next(iter(self.a), nothing)
        if first is not nothing:
            self.assertEqual(first["name"], "Lypla1")
            self.assertEqual(first["array_systematic_name"],
                             'scl000965.1_10-S')
Exemple #14
0
 def test_pileup_respect_strand(self):
     """With respect_strand=True the 'track_test' pileup is 14.5 at all 30 points."""
     track = glbase3.flat_track(filename="/tmp/test.flat", bin_format="f")

     bed = glbase3.genelist(filename="test_data/track_test.bed",
                            format=glbase3.format.bed)
     windows = bed.pointify().expand('loc', 15)

     pileups, _ = track.pileup(
         genelists=windows,
         filename="test_images/test_output_respect_strand.png",
         bandwidth=15,
         respect_strand=True)

     # Original author's note: with respect_strand=True there are four
     # arrays (two per orientation) whose average is 14.5 at every point.
     expected_result = numpy.full(30, 14.5)
     observed = list(pileups['track_test'])
     self.assertListEqual(observed, list(expected_result))
Exemple #15
0
    def test_force_tsvarg(self):
        """force_tsv=True as a keyword argument parses tss_loc for every list type."""
        tsv_path = "test_data/mm9_refGene.tsv"
        expected_tss = "chr1:134212701-134212701"

        form = {'tss_loc': 1, 'skiplines': 0}  # This loads tss_loc as strings
        form_delayed = {'tss_loc': 1, 'skiplines': 0}  # delayedlists must have skiplines

        a = gl.genelist(filename=tsv_path, force_tsv=True, format=form)
        c = gl.delayedlist(filename=tsv_path, format=form, force_tsv=True)
        d = gl.genome(filename=tsv_path, format=form_delayed, force_tsv=True)
        # Fake array data; must go last as it modifies format.
        e = gl.expression(filename=tsv_path, format=form, force_tsv=True,
                          expn="column[5:]")

        # Make sure the whole row was not lumped into a single key.  Per the
        # original author, delayedlists work here because __getitem__()
        # returns the zeroth entry.
        for loaded in (a, c, d, e):
            self.assertEqual(expected_tss, loaded[0]["tss_loc"])
Exemple #16
0
 def test_removeDuplicatesByLoc_delete_any_matches(self):
     """With delete_any_matches=True both modes leave two entries."""
     # Eight copies of 100-200, two copies of 130-230, then two singletons.
     spans = [(100, 200)] * 8 + [(130, 230)] * 2
     spans.append((9800, 9990))  # across bucket
     spans.append((10001, 10200))

     entries = [
         {'loc': glbase3.location(chr=1, left=left, right=right)}
         for left, right in spans
     ]
     glist = glbase3.genelist()
     glist.load_list(entries)

     for mode, delta in (('pointify_expand', 10), ('overlap', 0)):
         dups = glist.removeDuplicatesByLoc(mode,
                                            'loc',
                                            delta,
                                            delete_any_matches=True)
         self.assertEqual(len(dups), 2)
Exemple #17
0
 def test_load_FASTA_gzips(self):
     """A gzipped FASTA file loads with the expected names and sequences."""
     self.b = glbase3.genelist(filename="test_data/Fasta_file.fa.gz",
                               format=glbase3.format.fasta,
                               gzip=True)

     # (entry index, key, expected value), checked in this order.
     expectations = (
         (0, 'seq',
          'AAATctggatacagtggcctttatttctagttccagtgactgggagactgaaacaagagagtcacttgagtacaggagtgcaaggctagcttgagcaatatagtaagactatctcaaaaTGTGAATTtagatcaacagaattgacatcaagaaaaatactgatatcactcaaagcaatctacagattcaacacaatctccatcaacatgacaatgacttccatcaGCATGACAATGACTCCATCAACATGCCAATGGGCCCCATCAACATAACAATGACCCCTATCATCATGACAATGATCCCCATCAACATGACAATGACCTCCATCAACATGACAATTACTCCTGTCAACATGCCAATtgttggggttcagaagtcaccctgcaaaccacaagaacact'),
         (0, 'name', 'loc_chr3:122137044-122138000_n1'),
         (1, 'seq',
          'CCCGTGAGCCCCTGCCGCACCCGCCGGTGTGCGGTTTAGCGCCGCGGTCAGTTGGGCCCTGGCGTTGTGTCGCGTCGGGAGCGTGTCCGCCTCGCGGCGGCTAGACGCGGGTGTCGCCGGGCTCCGACGGGTGGCCTATCCAGGGCTCGCCCCCGCCGTCCCCCGCCTGCCCGTCCCGGTGGTGGTCGTTGGTGTGGGGAGTGAATGGTGCTACCGGTCATTCCCTCCCGCGTGGTTTGACTGTCTCGCCGGTGTCGCGCTTCTCTTTCCGCCAACCCCCACGCCAACCCACCGCCCTGTGCTCCGCGCCCGGTGCGGTCGACGTTCCGGCTCTCCCGATGCCGAGGGGTTCGGGATTTGTGCCGGGGACGGAGGGGAGAGCGGATAAGAGAGGTGTCGGA'),
         (1, 'name', 'loc_chr17:39450772-39457159_n2'),
     )
     for index, key, expected in expectations:
         self.assertEqual(self.b[index][key], expected)
Exemple #18
0
   def test_force_tsv_format(self):
       """force_tsv=True inside the format dict parses tss_loc for every list type."""
       tsv_path = "test_data/mm9_refGene.tsv"
       expected_tss = "chr1:134212701-134212701"

       form = {'tss_loc': 1, 'force_tsv': True, 'chr': 1}
       form_delayed = {'tss_loc': 1, 'force_tsv': True, 'skiplines': 0}  # delayedlists must have skiplines

       a = gl.genelist(filename=tsv_path, format=form)
       c = gl.delayedlist(filename=tsv_path, format=form_delayed)
       d = gl.genome(filename=tsv_path, format=form)
       # Must go last as it modifies format.
       e = gl.expression(filename=tsv_path, format=form, expn="column[5:]")

       # Make sure the whole row was not lumped into a single key.
       for loaded in (a, c, d, e):
           self.assertEqual(expected_tss, loaded[0]["tss_loc"])
Exemple #19
0
    def test_sort_location(self):
        """sort('loc') puts chr1:999-2000 first and chr2:1050-2000 last."""
        spans = (
            "chr1:1000-2000",
            "chr1:1001-2000",
            "chr1:1000-2001",
            "chr2:1050-2000",
            "chr1:999-2000",
        )
        unsorted = [{'loc': location(span)} for span in spans]

        glist = glbase3.genelist()
        glist.load_list(unsorted)
        glist.sort('loc')

        last_loc = str(glist[-1]['loc'])
        self.assertEqual(last_loc, "chr2:1050-2000")
        first_loc = str(glist[0]['loc'])
        self.assertEqual(first_loc, "chr1:999-2000")
Exemple #20
0
 def test_motif_fasta_scan(self):
     """scan_sequences() returns all six sequences and marks three as 'Found'."""
     sequences = (
         'ACcactcacccattgtaaAAAcCCCAaaaa',
         'ACACCCATTGTc',
         'cagtgtCCtggcC',
         'CAGTCtCaTTgtC',
         'NNNNNNNNNNNNN',
         'GCGCGCGCGCGCGC',
     )
     fgl = gl.genelist()
     fgl.load_list([{'seq': sequence} for sequence in sequences])

     scanned = self.sox2.scan_sequences(fgl)
     found = scanned.get(self.sox2.name, 'Found')  # Found seqs only

     self.assertEqual(len(scanned), 6)
     self.assertEqual(len(found), 3)

     first_hit = {
         'seq': 'ACcactcacccattgtaaAAAcCCCAaaaa',
         'Sox2': 'Found',
         'Sox2_seqs': 'cattgta',
     }
     self.assertDictEqual(found[0], first_hit)
Exemple #21
0
 def test_load_gzips(self):
     """A gzipped CSV loaded with the sniffer yields the expected entries at [-1] and [2]."""
     self.b = glbase3.genelist(filename="test_data/array_data.csv.gz",
                               format=glbase3.format.sniffer,
                               gzip=True)

     last_entry = {
         'name': 'Pdia4',
         'GFP': 1.18,
         'Mash': 0.6,
         'array_systematic_name': 'scl29051.11.1_27-S',
         'refseq': 'NM_009787',
         'entrez': 12304,
     }
     self.assertDictEqual(self.b[-1], last_entry)

     third_entry = {
         'name': 'Srpr',
         'GFP': 1,
         'Mash': 0.77,
         'array_systematic_name': 'scl0067398.1_126-S',
         'refseq': 'NM_026130',
         'entrez': 67398,
     }
     self.assertDictEqual(self.b[2], third_entry)
Exemple #22
0
    def test_chip_seq_cluster_heatmap_error(self):
        """chip_seq_cluster_heatmap() raises for an empty second argument and for lists without 'loc'."""
        no_loc_gl = glbase3.genelist()
        no_loc_gl.load_list(
            [{'name': label} for label in ('missing', 'a', 'loc', 'key')])

        # Fails at a different stage, but passes the assertion.
        with self.assertRaises(ValueError):
            self.g.chip_seq_cluster_heatmap(
                [self.data1, self.data2, self.data3], [])

        # One, then two, of the input lists have no 'loc' key.
        with self.assertRaises(glbase3.errors.AssertionError):
            self.g.chip_seq_cluster_heatmap(
                [self.data1, self.data2, no_loc_gl], [])

        with self.assertRaises(glbase3.errors.AssertionError):
            self.g.chip_seq_cluster_heatmap(
                [self.data1, no_loc_gl, no_loc_gl], [])
Exemple #23
0
    def setUp(self):
        """Build a genelist with repeated names: Nanog x3, Pou5f1 x2, Sox2 x1."""
        rows = (
            ("Nanog", 2),
            ("Nanog", 3),
            ("Nanog", 4),
            ("Pou5f1", 5),
            ("Pou5f1", 6),
            ("Sox2", 7),
        )
        data = [{"name": name, "items": items} for name, items in rows]

        self.a = gl.genelist()
        self.a.load_list(data)
Exemple #24
0
 def setUp(self):
     """Load array_data.csv into self.a using the sniffer format."""
     csv_path = "test_data/array_data.csv"
     self.a = glbase3.genelist(filename=csv_path, format=glbase3.format.sniffer)
Exemple #25
0
 def setUp(self):
     """Load testA.csv as a genelist (self.a) and test-genome.csv as a genome (self.g)."""
     sniffer = glbase3.format.sniffer
     self.a = glbase3.genelist(filename="test_data/testA.csv", format=sniffer)
     self.g = glbase3.genome(filename="test_data/test-genome.csv", format=sniffer)
Exemple #26
0
# All tables below come from the same 'rank_genes_groups' entry of adata.uns.
_rank_groups = adata.uns['rank_genes_groups']

print(pd.DataFrame(_rank_groups['names']))

print()
topall = pd.DataFrame(_rank_groups['names'])  # get all;
fcs = pd.DataFrame(_rank_groups['logfoldchanges'])
padj = pd.DataFrame(_rank_groups['pvals_adj'])

topall.to_csv('top100.csv')

# Go through and trim the TEs:
# the set of TE names is read from the first column of a tab-separated file.
_te_format = {'name': 0, 'force_tsv': True}
TEs = set(
    genelist(filename='../../TE_genes_id.mm10.txt', format=_te_format)['name'])

newcols = {}

groups = list(topall.columns.values)

for group in groups:
    newcols[group] = []

    # Per-group (name, log fold change, adjusted p-value) triples.
    t = zip(
        *[[i[group] for i in _rank_groups[field]]
          for field in ('names', 'logfoldchanges', 'pvals_adj')])
Exemple #27
0
"""

This is the code from the publication.

relative paths were removed in the text for clarity.

"""

import glbase3 as gl

# Load the three gzipped input files.
peaks = gl.genelist(
    filename="../shared_raw_data/macs_list.xls.gz",
    format=gl.format.macs_summit,
    gzip=True,
)
bed = gl.genelist(
    filename="../shared_raw_data/Sox2_Oct4_ol_w100_annot.bed.gz",
    format=gl.format.bed,
    gzip=True,
)
fasta = gl.genelist(
    filename="../shared_raw_data/Fasta_file.fa.gz",
    format=gl.format.fasta,
    gzip=True,
)

print(peaks)

peaks = peaks[0:1]  # list was truncated for clarity:

pointified = peaks.pointify()  # Take the middle point of the interval
print(pointified)

expanded = peaks.expand("loc", 100)  # expand the left and right border by 100 bp
print(expanded)

print(bed)

print(fasta)
Exemple #28
0
    def measure_te_anchors(self):
        '''
        **Purpose**
            Make a crude measure of the
            TE <-> TE
            TE <-> -
            -  <-> -

            possible arrangements, and write three report files named after
            self.project_name:

            <project_name>_crude_measures.txt
            <project_name>_te-te_anchor_frequencies.tsv
            <project_name>_te-nn_anchor_frequencies.tsv

        **Arguments**
            None. Reads self.filename (gzipped, tab-separated text),
            self.genome and self.project_name.

        **Returns**
            None

        **Raises**
            ZeroDivisionError if self.filename contains no lines, because the
            counts are divided by the total number of lines read.
        '''
        te = {'TE <-> TE': 0, 'TE <-> -': 0, '-  <-> -': 0}
        res_te_te = {}
        res_te_nn = {}

        # The format of the TE file is:
        # read1.chrom read1.left read1.right read1.labels read1.type read2.chrom read2.left read2.right read2.labels read2.type

        print("Measures anchors...")
        total = 0
        # 'with' closes the handle even if a malformed line raises part-way.
        with gzip.open(self.filename, 'rt') as oh:
            for line in oh:
                r = line.strip().split('\t')
                left_is_te = 'TE' in r[4]
                right_is_te = 'TE' in r[9]
                total += 1

                if total % 1000000 == 0:
                    print('Processed: {:,}'.format(total))

                if left_is_te and right_is_te:
                    te['TE <-> TE'] += 1

                    # possible to have more than one TE; the labels can also
                    # hoover up some genes, so use ':' to discriminate TEs
                    tel = [i.strip() for i in r[3].split(',') if ':' in i]
                    ter = [i.strip() for i in r[8].split(',') if ':' in i]
                    # sort each pair to make it unidirectional, and use a set
                    # so each pair is counted once per line
                    combs = {tuple(sorted(pair)) for pair in product(tel, ter)}
                    for c in combs:
                        res_te_te[c] = res_te_te.get(c, 0) + 1

                elif left_is_te or right_is_te:
                    te['TE <-> -'] += 1

                    # Exactly one side is a TE here; take that side's labels.
                    labels = r[3] if left_is_te else r[8]
                    for t in [i.strip() for i in labels.split(',') if ':' in i]:
                        res_te_nn[t] = res_te_nn.get(t, 0) + 1

                else:
                    te['-  <-> -'] += 1

        print('\nmeasure_te_anchors():')
        print('  TE <-> TE : {:,} ({:.2%})'.format(te['TE <-> TE'],
                                                   te['TE <-> TE'] / total))
        print('  TE <-> -- : {:,} ({:.2%})'.format(te['TE <-> -'],
                                                   te['TE <-> -'] / total))
        print('  -- <-> -- : {:,} ({:.2%})'.format(te['-  <-> -'],
                                                   te['-  <-> -'] / total))
        print()

        with open('%s_crude_measures.txt' % self.project_name, 'w') as oh:
            oh.write('TE <-> TE : {:,} ({:.5%})\n'.format(
                te['TE <-> TE'], te['TE <-> TE'] / total))
            oh.write('TE <-> -- : {:,} ({:.5%})\n'.format(
                te['TE <-> -'], te['TE <-> -'] / total))
            oh.write('-- <-> -- : {:,} ({:.5%})\n'.format(
                te['-  <-> -'], te['-  <-> -'] / total))

        with open('%s_te-te_anchor_frequencies.tsv' % self.project_name,
                  'w') as oh_te_te:
            oh_te_te.write('%s\n' % '\t'.join([
                'TE1', 'TE2', 'RPM', 'RPM per kbp TE', 'TE1_genome_freq',
                'TE2_genome_freq'
            ]))
            for k in sorted(res_te_te):
                te1 = self.genome._findDataByKeyLazy('name', k[0])
                te2 = self.genome._findDataByKeyLazy('name', k[1])
                rpm = res_te_te[k] / total * 1e6
                # NOTE(review): the two 'genome_count' values are summed and
                # used as the joint TE size -- confirm the units.
                joint_kb_size = te1['genome_count'] + te2['genome_count']
                rpmpkbte = (rpm / joint_kb_size) * 1e3
                line = {
                    'te1': k[0],
                    'te2': k[1],
                    'rpm': rpm,
                    'rpmpkbte': rpmpkbte,
                    # Convert the percentages back to fractions
                    'te1_genome_freq': te1['genome_percent'] / 100.0,
                    'te2_genome_freq': te2['genome_percent'] / 100.0,
                }
                oh_te_te.write(
                    '{i[te1]}\t{i[te2]}\t{i[rpm]}\t{i[rpmpkbte]}\t{i[te1_genome_freq]}\t{i[te2_genome_freq]}\n'
                    .format(i=line))

        te_nn = glbase3.genelist()
        te_nn.load_list([{
            'name': name,
            'count': count
        } for name, count in res_te_nn.items()])
        te_nn = te_nn.map(genelist=self.genome, key='name')
        # 'item' rather than 'te', so the counter dict above is not shadowed
        for item in te_nn:
            item['RPM'] = (res_te_nn[item['name']] / total) * 1e6
            item['RPM per kbp of TE'] = (item['RPM'] / item['genome_count']) * 1e3
        te_nn._optimiseData()
        te_nn.sort('name')
        te_nn.saveTSV('%s_te-nn_anchor_frequencies.tsv' % self.project_name,
                      key_order=[
                          'name', 'count', 'genome_count', 'genome_percent',
                          'RPM'
                      ])
Exemple #29
0
 def test_len(self):
     """A delayedlist reports the same length as a genelist of the same CSV."""
     csv_path = "test_data/array_data.csv"
     self.b = glbase3.genelist(filename=csv_path, format=format)
     self.a = glbase3.delayedlist(filename=csv_path, format=format)

     delayed_length = len(self.a)
     self.assertEqual(delayed_length, len(self.b))
Exemple #30
0
    newl = []
    for k in keys:
        d = get_details(k)
        writer.writerow([d[k] for k in key_order])
        newl.append(d)
    oh.close()

    print()
    print("\n>>>Citation:")
    gses = list(gse_to_reference.keys())
    gses.sort()
    r = ", ".join(["%s (%s:%s)" % (gse, gse_to_reference[gse], pmid_to_gse[gse]) for gse in gses])
    print(r)
    print()

    gl = glbase3.genelist()
    gl.load_list(newl)
    gl.save("sample_map.glb")

    print()
    print("Number of studies in domain:")
    res = {}
    for k in gene_layer_name:
        res[k] = []
        for ct in gene_layer_name[k]:
            res[k] += sample_to_gse[ct]

        res[k] = set(res[k])

    for k in sorted(res):
        print(k, len(res[k]))