def test_jaspar(self):
    """correctly load jaspar formatted counts matrix

    Reads ``data/sample.jaspar`` and checks the parsed identifier line,
    the full counts array, and two individual ``[position, base]`` lookups.
    """
    path = "data/sample.jaspar"
    mid, pwm = jaspar.read(path)
    # unittest assertion rather than a bare ``assert``: it is consistent with
    # the checks below, is not stripped under ``python -O``, and reports both
    # values on failure
    self.assertEqual(mid, ["PSSMid", "HGNCsymbol"], "ID line wrong")
    # note state indices are ordered by moltype, so each row of ``expect``
    # holds the counts for one base and each column is one motif position
    expect = [
        [35, 374, 30, 121, 6, 121, 33],
        [0, 10, 0, 0, 3, 2, 44],
        [352, 3, 354, 268, 360, 222, 155],
        [2, 2, 5, 0, 10, 44, 157],
    ]
    # transpose so rows become positions, matching the [position, base]
    # indexing used in the lookups below
    assert_array_equal(pwm.array, array(expect).T)
    self.assertEqual(pwm[0, "A"], 352)
    self.assertEqual(pwm[3, "T"], 121)
""" Draw sequence logos =================== Sequence logo's display sequence information. They're extensively applied to transcription factor binding site (TFBS) display. They can also be applied to sequence alignments more generally. """ #%% # Drawing logo for a TFBS # ####################### # # We use the TFBS for the TAT box binding protein. from cogent3 import load_aligned_seqs from cogent3.parse import jaspar _, pwm = jaspar.read("../../data/tbp.jaspar") freqarr = pwm.to_freq_array() freqarr[:5] # illustrating the contents of the MotifFreqsArray # %% logo = freqarr.logo() logo.show(height=250, width=500) #%% # Drawing a sequence logo from a multiple sequence alignment # ########################################################## # # This can be done for an entire alignment, but bear in mind it can take some time to render. Note that we include gap characters in the display. aln = load_aligned_seqs("../../data/brca1-bats.fasta", moltype="dna") l = aln[:311].seqlogo(height=300, width=500, wrap=60, vspace=0.05)