def setUp(self):
    # Fixture: load the three peak lists from test_data/ and bundle them
    # into a glglob of type "peaklist" for the tests that follow.
    self.data1 = glbase3.genelist(
        filename="test_data/testA.csv",
        format={'loc': 0, 'name': 1, 'score': 2, 'skiplines': 0},
    )
    self.data2 = glbase3.genelist(
        filename="test_data/testB.csv",
        format={'loc': 0, 'name': 1},
    )
    self.data3 = glbase3.genelist(
        filename="test_data/testC.csv",
        format={'loc': 0, 'name': 1},
    )
    # Disabled in the original fixture:
    #self.data4 = glbase3.genelist(filename="test_data/ccat_list.region", format=glbase3.format_ccat_output)

    print(self.data1)

    self.g = glbase3.glglob(self.data1, self.data2, self.data3, type="peaklist")
def setUp(self):
    # Fixture: five genelists loaded from files (a-e) and three small
    # in-memory genelists (f1-f3) with overlapping gene names.
    def _genelist_from(entries):
        # Create an empty genelist and fill it from a list of dicts.
        built = glbase3.genelist()
        built.load_list(entries)
        return built

    sniffer = glbase3.format.sniffer
    self.a = glbase3.genelist(filename="test_data/testA.csv", format=sniffer)
    self.b = glbase3.genelist(filename="test_data/testB.csv", format=sniffer)
    self.c = glbase3.genelist(filename="test_data/testC.csv", format=sniffer)
    self.d = glbase3.genelist(filename="test_data/ccat_list.region",
                              format=glbase3.format.ccat_output)
    self.e = glbase3.genelist(filename="test_data/macs_list.xls",
                              format=glbase3.format.macs_output)

    self.f1 = _genelist_from([
        {"name": "gene1"},
        {"name": "gene2"},
        {"name": "gene3"},
        {"name": "gene4"},
        {"name": "gene5"},
    ])

    self.f2 = _genelist_from([
        {"name": "gene4", "alt_key": "meh"},
        {"name": "gene5"},
        {"name": "gene6"},
        {"name": "gene7"},
        {"name": "gene8"},
        {"name": "gene9"},
    ])

    self.f3 = _genelist_from([
        {"name": "gene4", "alt_key": False},
        {"name": "gene4", "alt_key": True},
        {"name": "gene5", "alt_key": False},
        {"name": "gene6"},
        {"name": "gene7"},
    ])
def test_save_fasta(self):
    # Fetch two sequences from the bound test genome, save them as FASTA
    # twice (once with a loc+name header, once with the default header)
    # and check the first four lines of each output file.
    genome_mm10 = glbase3.genome()
    genome_mm10.bindSequence("test_data/seq")

    regions = [
        {"name": "A", "loc": glbase3.location(loc="chr1:100-150")},
        {"name": "X", "loc": glbase3.location(loc="chrA:100-150")},
    ]
    region_list = glbase3.genelist()
    region_list.load_list(regions)

    fasta = genome_mm10.getSequences(region_list)

    seq_chr1 = 'ATCAGACAGGTAGATCATCTCGCTCCGAGCTTGCCACCAGCAAACCATTGC'
    seq_chrA = 'GTAAAAACCCGATGGAATACTCATCCAGTAAGTCCGAACCACTTCAACATC'

    # Header built from the "loc" and "name" keys.
    fasta.saveFASTA(filename="/tmp/test_fasta.fa", name=["loc", "name"])
    with open("/tmp/test_fasta.fa") as handle:
        for expected in ('>chr1:100-150_A', seq_chr1, '>chrA:100-150_X', seq_chrA):
            self.assertEqual(handle.readline().strip(), expected)

    # Default header.
    fasta.saveFASTA(filename="/tmp/test_fasta.fa")
    with open("/tmp/test_fasta.fa") as handle:
        for expected in ('>chr1:100-150', seq_chr1, '>chrA:100-150', seq_chrA):
            self.assertEqual(handle.readline().strip(), expected)
def setUp(self):
    # Fixture: a delayedlist over the array data plus a two-entry genelist.
    # NOTE(review): `format` is resolved from the enclosing scope, which is
    # not visible in this block.
    self.a = glbase3.delayedlist(filename="test_data/array_data.csv", format=format)

    # The original author notes this spoof list is not strictly needed.
    self.b = glbase3.genelist()
    self.b.load_list([{"name": "Lypla1"}, {"name": "Pdia4"}])
def test_remove_dupes_by_loc(self):
    # Nine locations, several identical or near each other; after
    # removeDuplicatesByLoc(delta=100, mode='pointify_expand') four remain.
    loc_strings = (
        "chr1:1000-1200",
        "chr1:1000-1200",
        "chr1:1100-1200",
        "chr1:1300-1400",
        "chr1:1300-1400",
        "chr1:1300-1400",
        "chr1:1600-1600",
        "chr1:1423-1423",
        "chr2:1000-1200",
    )
    entries = [{"loc": glbase3.location(loc=text)} for text in loc_strings]

    peaks = glbase3.genelist()
    peaks.load_list(entries)

    deduped = peaks.removeDuplicatesByLoc(delta=100, mode='pointify_expand')
    self.assertEqual(len(deduped), 4)
def test_sam_tophat_xs(self):
    # Load test.sam with the sam_tophat_xs format, first as a genelist and
    # then as a delayedlist, and check loc/seq/strand of the first reads.
    first_loc = "chr1:3035081-3035081"
    first_seq = "AAACATTCCTGGGAACATCTTGACCATAAGATAAAGGGGACTGTGAAGACATAGCAGGGCTATATTATCTAAGTCAACACCATCTGGCCG"

    reads = gl.genelist("test_data/test.sam", format=gl.format.sam_tophat_xs)
    self.assertEqual(reads[0]["loc"], first_loc)
    self.assertEqual(reads[0]["seq"], first_seq)
    self.assertEqual(reads[0]["strand"], "+")
    self.assertEqual(reads[1]["strand"], "-")

    # The same checks through a delayedlist (the author's usual use case).
    reads = gl.delayedlist("test_data/test.sam", format=gl.format.sam_tophat_xs)
    for position, read in enumerate(reads):
        if position == 0:
            self.assertEqual(read["loc"], first_loc)
            self.assertEqual(read["seq"], first_seq)
            self.assertEqual(read["strand"], "+")
        elif position == 1:
            self.assertEqual(read["strand"], "-")
def test_pileup(self):
    # Build a fresh track holding six identical reads (10..15 on chr1),
    # pile it up around the locations in track_test.bed and compare the
    # profile with the expected values.
    t = gl.track(filename="/tmp/test_pileup.trk", new=True, name="Test Track")
    for left in [10] * 6:
        t.add_location(gl.location(chr="chr1", left=left, right=left + 5))
    t.finalise()

    g = gl.genelist(filename="test_data/track_test.bed", format=gl.format.bed)

    L = t.pileup(
        genelist=g,
        filename="test_images/test_output.png",
        heatmap_filename="test_images/test_heatmap.png",
        window_size=15,
        bin_size=1,
        respect_strand=True,
        normalise=False,
        read_extend=1,
        raw_tag_filename="test_images/test_tags.tsv",
    )

    # Units are reads per item in the genelist (author's note).
    expected_result = numpy.array(
        [0.] * 10 + [3.] * 3 + [6.] * 4 + [3.] * 3 + [0.] * 10
    )

    self.assertTrue(
        all(got == want for got, want in zip(L["pileup"], expected_result))
    )
def test_renameKey(self):
    # renameKey must return a new list with the key renamed and leave the
    # list it was called on untouched.
    self.a = glbase3.genelist(filename="test_data/array_data.csv",
                              format=glbase3.format.sniffer)
    renamed = self.a.renameKey("name", "other-name")

    # Source list still has the old key only.
    self.assertIn("name", self.a[0])
    self.assertNotIn("other-name", self.a[0])

    # Returned list has the new key only.
    self.assertNotIn("name", renamed[0])
    self.assertIn("other-name", renamed[0])
def test_buckets(self):
    # Check how locations are distributed into genelist.buckets when the
    # bucket size is reduced to 100.
    #
    # glbase3.config.bucket_size is a process-wide setting. It is saved
    # first and restored in a `finally` so that a failing assertion here
    # cannot leak the small bucket size into later tests.
    previous_bucket_size = glbase3.config.bucket_size
    glbase3.config.bucket_size = 100  # smaller value for testing purposes
    try:
        g = glbase3.genelist()

        data = [
            {"loc": glbase3.location(loc="chr1:1000-1200")},
            {"loc": glbase3.location(loc="chr1:1200-1300")},
            {"loc": glbase3.location(loc="chr1:1200-1201")},
            {"loc": glbase3.location(loc="chr1:1300-1400")},
            {"loc": glbase3.location(loc="chr1:1400-1500")},
            {"loc": glbase3.location(loc="chr1:1500-1600")},
            {"loc": glbase3.location(loc="chr1:1600-1600")},  # point loc on the edge of a bucket
            {"loc": glbase3.location(loc="chr1:1423-1423")},  # point loc in the middle of a bucket
            {"loc": glbase3.location(loc="chr1:0-1500")},  # span much larger than a bucket
        ]
        g.load_list(data)

        bucket_size = glbase3.config.bucket_size
        left_buck = int((1299 - 1) / bucket_size) * bucket_size
        right_buck = int(1788 / bucket_size) * bucket_size
        # Include both the left-spanning and the right-spanning buckets.
        buckets_reqd = range(left_buck, right_buck + bucket_size, bucket_size)

        loc_ids = set()  # unique ids found in the requested buckets
        for buck in buckets_reqd:
            if buck in g.buckets["1"]:
                loc_ids.update(g.buckets["1"][buck])

        self.assertSetEqual(loc_ids, set(range(9)))
        self.assertEqual(len(g.buckets), 1)
        self.assertEqual(len(g.buckets["1"]), 17)
    finally:
        glbase3.config.bucket_size = previous_bucket_size  # change it back
def test_pileup_no_respect_strand(self):
    # Pile up the flat track over the pointified/expanded bed locations
    # with respect_strand=False; the expected profile is 0..29.
    t = glbase3.flat_track(filename="/tmp/test.flat", bin_format="f")

    bed = glbase3.genelist(filename="test_data/track_test.bed",
                           format=glbase3.format.bed)
    g = bed.pointify().expand('loc', 15)

    L, _ = t.pileup(
        genelists=g,
        filename="test_images/test_output_no_strand.png",
        bandwidth=15,
        respect_strand=False,
    )

    expected_result = numpy.array(list(range(30)))

    self.assertListEqual(list(L['track_test']), list(expected_result))
def test_tsv_sniffer_force(self):
    # The input is a TSV file loaded with force_tsv=True and the sniffer
    # format, once as a genelist and once as a genome.
    # (Author's note: microarrays and delayedlists can't be sniffed.)
    tsv_path = "test_data/mm9_refGene.tsv"
    a = gl.genelist(filename=tsv_path, force_tsv=True, format=gl.format.sniffer)
    d = gl.genome(filename=tsv_path, force_tsv=True, format=gl.format.sniffer)

    # The columns must have been split, not lumped into a single key.
    expected_tss = "chr1:134212701-134212701"
    self.assertEqual(expected_tss, a[0]["tss_loc"])
    self.assertEqual(expected_tss, d[0]["tss_loc"])
def test_genelist_from_pandas(self):
    # A genelist filled from self.df must have one entry per DataFrame row,
    # and the values of rows 0, 1 and 3 must match as sets.
    glb = gl.genelist()
    glb.from_pandas(self.df)

    self.assertEqual(len(glb), self.df.shape[0])

    for row in (0, 1, 3):
        self.assertSetEqual(set(glb[row].values()),
                            set(self.df.loc[[row]].values[0]))
def test_gzipped_delayedlist(self):
    # A delayedlist over the gzipped CSV must report the same length as a
    # genelist over the plain CSV, and yield the expected first item.
    # NOTE(review): `format` is resolved from the enclosing scope, which is
    # not visible in this block.
    self.b = glbase3.genelist(filename="test_data/array_data.csv", format=format)
    self.a = glbase3.delayedlist(filename="test_data/array_data.csv.gz",
                                 format=format,
                                 gzip=True)

    self.assertEqual(len(self.a), len(self.b))

    # Only the first yielded item is inspected.
    for first in self.a:
        self.assertEqual(first["name"], "Lypla1")
        self.assertEqual(first["array_systematic_name"], 'scl000965.1_10-S')
        break
def test_pileup_respect_strand(self):
    # Same setup as the respect_strand=False pileup test, but with
    # respect_strand=True. Per the original author's note, forward and
    # reverse oriented profiles are averaged, giving 14.5 at every point.
    t = glbase3.flat_track(filename="/tmp/test.flat", bin_format="f")

    bed = glbase3.genelist(filename="test_data/track_test.bed",
                           format=glbase3.format.bed)
    g = bed.pointify().expand('loc', 15)

    L, _ = t.pileup(
        genelists=g,
        filename="test_images/test_output_respect_strand.png",
        bandwidth=15,
        respect_strand=True,
    )

    expected_result = numpy.full(30, 14.5)

    self.assertListEqual(list(L['track_test']), list(expected_result))
def test_force_tsvarg(self):
    # Load the same TSV through genelist, delayedlist, genome and expression
    # with force_tsv passed as a keyword argument, and check that tss_loc
    # was parsed as its own column in each.
    form = {"tss_loc": 1, "skiplines": 0}  # this loads tss_loc as strings
    form_delayed = {"tss_loc": 1, "skiplines": 0}  # delayedlists must have skiplines
    tsv_path = "test_data/mm9_refGene.tsv"

    a = gl.genelist(filename=tsv_path, force_tsv=True, format=form)
    c = gl.delayedlist(filename=tsv_path, format=form, force_tsv=True)
    d = gl.genome(filename=tsv_path, format=form_delayed, force_tsv=True)
    # Fake array data. Author's note: must go last as it modifies format.
    e = gl.expression(filename=tsv_path,
                      format=form,
                      force_tsv=True,
                      expn="column[5:]")

    # The columns must have been split, not lumped into a single key.
    expected_tss = "chr1:134212701-134212701"
    self.assertEqual(expected_tss, a[0]["tss_loc"])
    self.assertEqual(expected_tss, c[0]["tss_loc"])
    # Author's note: delayedlists should work as __getitem__() will return
    # the zeroth entry.
    self.assertEqual(expected_tss, d[0]["tss_loc"])
    self.assertEqual(expected_tss, e[0]["tss_loc"])
def test_removeDuplicatesByLoc_delete_any_matches(self):
    # With delete_any_matches=True only two entries are expected to be
    # returned, in both 'pointify_expand' and 'overlap' modes.
    spans = (
        [(100, 200)] * 8
        + [(130, 230)] * 2
        + [(9800, 9990),  # across bucket
           (10001, 10200)]
    )
    entries = [
        {'loc': glbase3.location(chr=1, left=left, right=right)}
        for left, right in spans
    ]

    peaks = glbase3.genelist()
    peaks.load_list(entries)

    dups = peaks.removeDuplicatesByLoc('pointify_expand', 'loc', 10,
                                       delete_any_matches=True)
    self.assertEqual(len(dups), 2)

    dups = peaks.removeDuplicatesByLoc('overlap', 'loc', 0,
                                       delete_any_matches=True)
    self.assertEqual(len(dups), 2)
def test_load_FASTA_gzips(self):
    # Load a gzipped FASTA file and check name and sequence of the first
    # two records.
    expected_records = [
        ('loc_chr3:122137044-122138000_n1',
         'AAATctggatacagtggcctttatttctagttccagtgactgggagactgaaacaagagagtcacttgagtacaggagtgcaaggctagcttgagcaatatagtaagactatctcaaaaTGTGAATTtagatcaacagaattgacatcaagaaaaatactgatatcactcaaagcaatctacagattcaacacaatctccatcaacatgacaatgacttccatcaGCATGACAATGACTCCATCAACATGCCAATGGGCCCCATCAACATAACAATGACCCCTATCATCATGACAATGATCCCCATCAACATGACAATGACCTCCATCAACATGACAATTACTCCTGTCAACATGCCAATtgttggggttcagaagtcaccctgcaaaccacaagaacact'),
        ('loc_chr17:39450772-39457159_n2',
         'CCCGTGAGCCCCTGCCGCACCCGCCGGTGTGCGGTTTAGCGCCGCGGTCAGTTGGGCCCTGGCGTTGTGTCGCGTCGGGAGCGTGTCCGCCTCGCGGCGGCTAGACGCGGGTGTCGCCGGGCTCCGACGGGTGGCCTATCCAGGGCTCGCCCCCGCCGTCCCCCGCCTGCCCGTCCCGGTGGTGGTCGTTGGTGTGGGGAGTGAATGGTGCTACCGGTCATTCCCTCCCGCGTGGTTTGACTGTCTCGCCGGTGTCGCGCTTCTCTTTCCGCCAACCCCCACGCCAACCCACCGCCCTGTGCTCCGCGCCCGGTGCGGTCGACGTTCCGGCTCTCCCGATGCCGAGGGGTTCGGGATTTGTGCCGGGGACGGAGGGGAGAGCGGATAAGAGAGGTGTCGGA'),
    ]

    self.b = glbase3.genelist(filename="test_data/Fasta_file.fa.gz",
                              format=glbase3.format.fasta,
                              gzip=True)

    # Sequence is checked before name for each record, as in the original.
    for position, (expected_name, expected_seq) in enumerate(expected_records):
        self.assertEqual(self.b[position]['seq'], expected_seq)
        self.assertEqual(self.b[position]['name'], expected_name)
def test_force_tsv_format(self):
    # Load the same TSV through genelist, delayedlist, genome and expression
    # with force_tsv given inside the format dict, and check that tss_loc
    # was parsed as its own column in each.
    form = {"tss_loc": 1, "force_tsv": True, "chr": 1}
    form_delayed = {"tss_loc": 1, "force_tsv": True, "skiplines": 0}  # delayedlists must have skiplines
    tsv_path = "test_data/mm9_refGene.tsv"

    a = gl.genelist(filename=tsv_path, format=form)
    c = gl.delayedlist(filename=tsv_path, format=form_delayed)
    d = gl.genome(filename=tsv_path, format=form)
    # Author's note: must go last as it modifies format.
    e = gl.expression(filename=tsv_path, format=form, expn="column[5:]")

    # The columns must have been split, not lumped into a single key.
    expected_tss = "chr1:134212701-134212701"
    for loaded in (a, c, d, e):
        self.assertEqual(expected_tss, loaded[0]["tss_loc"])
def test_sort_location(self):
    # Sorting on 'loc' must put chr1:999-2000 first and the chr2 entry last.
    loc_strings = (
        "chr1:1000-2000",
        "chr1:1001-2000",
        "chr1:1000-2001",
        "chr2:1050-2000",
        "chr1:999-2000",
    )
    entries = [{'loc': location(text)} for text in loc_strings]

    unsorted = glbase3.genelist()
    unsorted.load_list(entries)
    unsorted.sort('loc')

    self.assertEqual(str(unsorted[-1]['loc']), "chr2:1050-2000")
    self.assertEqual(str(unsorted[0]['loc']), "chr1:999-2000")
def test_motif_fasta_scan(self):
    # Scan six short sequences with self.sox2: all six come back in the
    # result, three of them are flagged 'Found', and the first hit carries
    # the matched sub-sequence.
    sequences = [
        'ACcactcacccattgtaaAAAcCCCAaaaa',
        'ACACCCATTGTc',
        'cagtgtCCtggcC',
        'CAGTCtCaTTgtC',
        'NNNNNNNNNNNNN',
        'GCGCGCGCGCGCGC',
    ]

    fgl = gl.genelist()
    fgl.load_list([{'seq': s} for s in sequences])

    scanned = self.sox2.scan_sequences(fgl)
    found = scanned.get(self.sox2.name, 'Found')  # found seqs only

    self.assertEqual(len(scanned), 6)
    self.assertEqual(len(found), 3)
    self.assertDictEqual(
        found[0],
        {'seq': 'ACcactcacccattgtaaAAAcCCCAaaaa', 'Sox2': 'Found', 'Sox2_seqs': 'cattgta'},
    )
def test_load_gzips(self):
    # Load the gzipped array data with the sniffer and check the last entry
    # and the entry at index 2 in full.
    self.b = glbase3.genelist(filename="test_data/array_data.csv.gz",
                              format=glbase3.format.sniffer,
                              gzip=True)

    expected_last = {
        'name': 'Pdia4',
        'GFP': 1.18,
        'Mash': 0.6,
        'array_systematic_name': 'scl29051.11.1_27-S',
        'refseq': 'NM_009787',
        'entrez': 12304,
    }
    self.assertDictEqual(self.b[-1], expected_last)

    expected_third = {
        'name': 'Srpr',
        'GFP': 1,
        'Mash': 0.77,
        'array_systematic_name': 'scl0067398.1_126-S',
        'refseq': 'NM_026130',
        'entrez': 67398,
    }
    self.assertDictEqual(self.b[2], expected_third)
def test_chip_seq_cluster_heatmap_error(self):
    # chip_seq_cluster_heatmap is expected to reject an empty second
    # argument with ValueError, and genelists lacking a 'loc' key with
    # glbase3.errors.AssertionError.
    no_loc_gl = glbase3.genelist()
    no_loc_gl.load_list([
        {'name': 'missing'},
        {'name': 'a'},
        {'name': 'loc'},
        {'name': 'key'},
    ])

    with self.assertRaises(ValueError):
        self.g.chip_seq_cluster_heatmap([self.data1, self.data2, self.data3], [])

    # Author's note: fails at a different stage, but passes the assertion.
    with self.assertRaises(glbase3.errors.AssertionError):
        self.g.chip_seq_cluster_heatmap([self.data1, self.data2, no_loc_gl], [])

    with self.assertRaises(glbase3.errors.AssertionError):
        self.g.chip_seq_cluster_heatmap([self.data1, no_loc_gl, no_loc_gl], [])
def setUp(self):
    # Fixture: a genelist with repeated names and increasing "items" values.
    rows = (
        ("Nanog", 2),
        ("Nanog", 3),
        ("Nanog", 4),
        ("Pou5f1", 5),
        ("Pou5f1", 6),
        ("Sox2", 7),
    )
    entries = [{"name": name, "items": count} for name, count in rows]

    self.a = gl.genelist()
    self.a.load_list(entries)
def setUp(self):
    # Fixture: the array data CSV loaded with the format sniffer.
    self.a = glbase3.genelist(
        filename="test_data/array_data.csv",
        format=glbase3.format.sniffer,
    )
def setUp(self):
    # Fixture: one peak list and the test genome, both loaded with the
    # format sniffer.
    sniffer = glbase3.format.sniffer
    self.a = glbase3.genelist(filename="test_data/testA.csv", format=sniffer)
    self.g = glbase3.genome(filename="test_data/test-genome.csv", format=sniffer)
# Show the ranked gene names per group, then keep names, log fold changes
# and adjusted p-values as DataFrames.
topall = pd.DataFrame(adata.uns['rank_genes_groups']['names'])  # get all
print(topall)
print()

fcs = pd.DataFrame(adata.uns['rank_genes_groups']['logfoldchanges'])
padj = pd.DataFrame(adata.uns['rank_genes_groups']['pvals_adj'])

topall.to_csv('top100.csv')

# Go through and trim the TEs:
TEs = set(
    genelist(
        filename='../../TE_genes_id.mm10.txt',
        format={'name': 0, 'force_tsv': True},
    )['name']
)

newcols = {}

groups = list(topall.columns.values)

for group in groups:
    newcols[group] = []

    # Per-group (name, logfoldchange, adjusted p-value) triples.
    t = zip(
        [row[group] for row in adata.uns['rank_genes_groups']['names']],
        [row[group] for row in adata.uns['rank_genes_groups']['logfoldchanges']],
        [row[group] for row in adata.uns['rank_genes_groups']['pvals_adj']],
    )
"""
This is the code from the publication.

Relative paths were removed in the text for clarity.
"""

import glbase3 as gl

# Load the three example inputs (all gzipped).
peaks = gl.genelist(
    filename="../shared_raw_data/macs_list.xls.gz",
    format=gl.format.macs_summit,
    gzip=True,
)
bed = gl.genelist(
    filename="../shared_raw_data/Sox2_Oct4_ol_w100_annot.bed.gz",
    format=gl.format.bed,
    gzip=True,
)
fasta = gl.genelist(
    filename="../shared_raw_data/Fasta_file.fa.gz",
    format=gl.format.fasta,
    gzip=True,
)

print(peaks)

# The list was truncated for clarity:
peaks = peaks[0:1]

# Take the middle point of the interval
print(peaks.pointify())

# Expand the left and right border by 100 bp
print(peaks.expand("loc", 100))

print(bed)
print(fasta)
def measure_te_anchors(self):
    '''
    **Purpose**
        Make a crude measure of the possible anchor arrangements:

        TE <-> TE
        TE <-> -
        - <-> -

        Reads the gzipped, tab-separated file at self.filename. Each line is
        expected to hold:

        read1.chrom read1.left read1.right read1.labels read1.type
        read2.chrom read2.left read2.right read2.labels read2.type

    **Results**
        Prints a summary and writes three files prefixed with
        self.project_name:

        <project>_crude_measures.txt
        <project>_te-te_anchor_frequencies.tsv
        <project>_te-nn_anchor_frequencies.tsv

        Returns None.
    '''
    def _te_labels(labels):
        # Labels can also hoover up some genes, so use ':' to discriminate TEs
        return [i.strip() for i in labels.split(',') if ':' in i]

    te = {'TE <-> TE': 0, 'TE <-> -': 0, '- <-> -': 0}
    res_te_te = {}
    res_te_nn = {}

    print("Measures anchors...")
    total = 0
    # Context manager: the handle is closed even if a malformed line raises.
    with gzip.open(self.filename, 'rt') as oh:
        for line in oh:
            r = line.strip().split('\t')
            left_is_te = 'TE' in r[4]
            right_is_te = 'TE' in r[9]

            total += 1
            if total % 1000000 == 0:
                print('Processed: {:,}'.format(total))

            if left_is_te and right_is_te:
                te['TE <-> TE'] += 1
                # Possible to have more than one TE on each side. Sorting each
                # pair makes it unidirectional; the set counts a pair once per read.
                combs = {
                    tuple(sorted(pair))
                    for pair in product(_te_labels(r[3]), _te_labels(r[8]))
                }
                for c in combs:
                    res_te_te[c] = res_te_te.get(c, 0) + 1
            elif left_is_te or right_is_te:
                te['TE <-> -'] += 1
                for t in _te_labels(r[3] if left_is_te else r[8]):
                    res_te_nn[t] = res_te_nn.get(t, 0) + 1
            else:
                te['- <-> -'] += 1

    def _fraction(count):
        # Avoid ZeroDivisionError when the input file contained no records.
        return count / total if total else 0.0

    print('\nmeasure_te_anchors():')
    print(' TE <-> TE : {:,} ({:.2%})'.format(te['TE <-> TE'], _fraction(te['TE <-> TE'])))
    print(' TE <-> -- : {:,} ({:.2%})'.format(te['TE <-> -'], _fraction(te['TE <-> -'])))
    print(' -- <-> -- : {:,} ({:.2%})'.format(te['- <-> -'], _fraction(te['- <-> -'])))
    print()

    with open('%s_crude_measures.txt' % self.project_name, 'w') as oh:
        oh.write('TE <-> TE : {:,} ({:.5%})\n'.format(te['TE <-> TE'], _fraction(te['TE <-> TE'])))
        oh.write('TE <-> -- : {:,} ({:.5%})\n'.format(te['TE <-> -'], _fraction(te['TE <-> -'])))
        oh.write('-- <-> -- : {:,} ({:.5%})\n'.format(te['- <-> -'], _fraction(te['- <-> -'])))

    with open('%s_te-te_anchor_frequencies.tsv' % self.project_name, 'w') as oh_te_te:
        oh_te_te.write('%s\n' % '\t'.join([
            'TE1', 'TE2', 'RPM', 'RPM per kbp TE', 'TE1_genome_freq', 'TE2_genome_freq'
        ]))
        for k in sorted(res_te_te):
            te1 = self.genome._findDataByKeyLazy('name', k[0])
            te2 = self.genome._findDataByKeyLazy('name', k[1])

            rpm = res_te_te[k] / total * 1e6
            joint_kb_size = te1['genome_count'] + te2['genome_count']
            rpmpkbte = (rpm / joint_kb_size) * 1e3

            line = {
                'te1': k[0],
                'te2': k[1],
                'rpm': rpm,
                'rpmpkbte': rpmpkbte,
                'te1_genome_freq': te1['genome_percent'] / 100.0,  # Convert back to fraction
                'te2_genome_freq': te2['genome_percent'] / 100.0,
            }
            oh_te_te.write(
                '{i[te1]}\t{i[te2]}\t{i[rpm]}\t{i[rpmpkbte]}\t{i[te1_genome_freq]}\t{i[te2_genome_freq]}\n'
                .format(i=line))

    te_nn = glbase3.genelist()
    te_nn.load_list([{'name': k, 'count': res_te_nn[k]} for k in res_te_nn])
    te_nn = te_nn.map(genelist=self.genome, key='name')
    # Named `entry` so the `te` counter dict above is not shadowed.
    for entry in te_nn:
        entry['RPM'] = (res_te_nn[entry['name']] / total) * 1e6
        entry['RPM per kbp of TE'] = (entry['RPM'] / entry['genome_count']) * 1e3
    # NOTE(review): items were modified in place above; presumably this
    # refreshes the list's internal state - confirm against glbase3.
    te_nn._optimiseData()
    te_nn.sort('name')
    te_nn.saveTSV('%s_te-nn_anchor_frequencies.tsv' % self.project_name,
                  key_order=['name', 'count', 'genome_count', 'genome_percent', 'RPM'])
def test_len(self):
    # A delayedlist and a genelist over the same CSV must report the same
    # length.
    # NOTE(review): `format` is resolved from the enclosing scope, which is
    # not visible in this block.
    csv_path = "test_data/array_data.csv"
    self.b = glbase3.genelist(filename=csv_path, format=format)
    self.a = glbase3.delayedlist(filename=csv_path, format=format)

    self.assertEqual(len(self.a), len(self.b))
# Write one row per key and keep the detail dicts for the genelist below.
newl = []
for k in keys:
    d = get_details(k)
    writer.writerow([d[column] for column in key_order])
    newl.append(d)
oh.close()

print()
print("\n>>>Citation:")
gses = sorted(gse_to_reference)
r = ", ".join(
    f"{gse} ({gse_to_reference[gse]}:{pmid_to_gse[gse]})" for gse in gses
)
print(r)
print()

# Save the collected details as a glbase3 genelist.
gl = glbase3.genelist()
gl.load_list(newl)
gl.save("sample_map.glb")

print()
print("Number of studies in domain:")
# Collect the distinct entries of sample_to_gse for each key of gene_layer_name.
res = {}
for k in gene_layer_name:
    res[k] = set()
    for ct in gene_layer_name[k]:
        res[k].update(sample_to_gse[ct])

for k in sorted(res):
    print(k, len(res[k]))