def _plot_motif(self, data, subseqs):
    """Turn a collection of subsequences into Motif object(s).

    Three input flavours are distinguished:

    * ``subseqs[0]`` is a numpy array: every entry is treated as a PWM whose
      columns are grouped in blocks of ``len(data.alpha_coder.alph1)``. For
      each position the first column that is not close to zero selects the
      ``alph0`` character (column // block size) and the block of columns
      containing it is copied as the structure row. The structure rows are
      averaged over all PWMs.
    * ``data.is_rna`` is true: every entry is decoded with
      ``data.alpha_coder.decode`` into a (sequence, structure) pair.
    * otherwise: the entries are used directly as sequences.

    Returns a ``(sequence_motif, structure_motif)`` tuple in the first two
    cases and a single Motif in the last one. ``subseqs`` must not be empty
    because its first element is inspected.
    """
    if not isinstance(subseqs[0], np.ndarray):
        if not data.is_rna:
            # no structure input, just sequence
            return Motif(data.one_hot_encoder.alphabet, sequences=subseqs)
        # original structure input was a string
        decoded_pairs = (data.alpha_coder.decode(entry) for entry in subseqs)
        seq_strings, struct_strings = zip(*decoded_pairs)
        seq_logo = Motif(data.alpha_coder.alph0, sequences=seq_strings)
        struct_logo = Motif(data.alpha_coder.alph1, sequences=struct_strings)
        return (seq_logo, struct_logo)

    # original structure input was a PWM
    coder = data.alpha_coder
    block_size = len(coder.alph1)
    seq_strings = []
    struct_rows = []
    for matrix in subseqs:
        # column index of the first non-zero entry at every position
        first_hit = np.argmax(~np.isclose(matrix, 0), axis=1)
        letters = ''.join(coder.alph0[k] for k in first_hit // block_size)
        seq_strings.append(letters)
        rows = np.zeros((len(letters), block_size), dtype=np.float32)
        for pos, col in enumerate(first_hit):
            # round down to the start of the column block that contains col
            start = col - col % block_size
            rows[pos] = matrix[pos, start:(start + block_size)]
        struct_rows.append(rows)
    mean_struct = np.sum(struct_rows, 0) / len(struct_rows)
    seq_logo = Motif(coder.alph0, sequences=seq_strings)
    struct_logo = Motif(coder.alph1, pwm=mean_struct)
    return (seq_logo, struct_logo)
def setUp(self):
    """Create the fixtures shared by the tests.

    ``self.ref_pwm`` is a 7x4 float32 matrix with a single 0.25 entry per
    row, placed at the "ACGT" column of the matching letter of "GATTACA".
    ``self.m`` is built from that sequence, ``self.m2`` from the matrix.
    """
    alphabet = "ACGT"
    word = "GATTACA"
    reference = np.zeros((len(word), len(alphabet)), dtype=np.float32)
    for row, letter in enumerate(word):
        reference[row, alphabet.index(letter)] = 0.25
    self.ref_pwm = reference
    self.m = Motif(alphabet, [word])
    self.m2 = Motif(alphabet, pwm=self.ref_pwm)
def test_utils_save_as_meme(self):
    """Check that utils.save_as_meme reproduces the reference MEME file.

    Two motifs are written to ``test.meme`` in the system temp directory and
    the file content is compared verbatim with
    ``<self.folder>/data/ref.meme``.
    """
    logos = [Motif('ACGT', ['GATTACA']), Motif('ACGT', ['AAAA'])]
    out_path = gettempdir() + "/test.meme"
    utils.save_as_meme(logos, out_path)
    try:
        with open(self.folder + "/data/ref.meme", 'rt') as handle:
            ref = handle.read()
        with open(out_path, 'rt') as handle:
            comp = handle.read()
        # assertEqual shows a diff of both texts when they differ
        self.assertEqual(ref, comp)
    finally:
        # clean up even if reading or the comparison above fails
        remove(out_path)
def _get_optimized_input(self, model, data, layer_name, node_index, boundary,
                         lr, steps, colors_sequence, colors_structure):
    """Optimize a random input for one node and plot it as motif(s).

    Up to five attempts are made. Each attempt draws a uniform random array
    in ``[-boundary, boundary)`` of shape
    ``(1, input_shape[0], input_shape[1])`` (taken from ``self.params``) and
    passes it to ``self._optimize_input``. The loop stops at the first
    attempt reported as successful; if none succeeds a warning is printed
    and the result of the last attempt is used anyway.

    ``utils.softmax`` is then applied along axis 1 of the optimized array.

    Returns a list with the plotted sequence motif when ``data.is_rna`` is
    false, otherwise ``[sequence_plot, structure_plot]`` where the two PWMs
    are obtained via ``self._extract_pwm``. The list items are whatever
    ``Motif.plot`` returns.
    """
    shape = (1, self.params["input_shape"][0], self.params["input_shape"][1])
    for _ in range(5):
        start = np.random.uniform(-boundary, boundary, shape)
        input_data, success = self._optimize_input(model, layer_name, node_index,
                                                   start, lr, steps)
        if success:
            break
    else:
        # reached only when no attempt broke out of the loop
        print("Warning: loss did not converge for node {} in layer '{}'".format(node_index, layer_name))

    input_data = np.apply_along_axis(utils.softmax, 1, input_data)

    if not data.is_rna:
        seq_only = Motif(data.one_hot_encoder.alphabet, pwm=input_data)
        return [seq_only.plot(colors_sequence, scale=0.25)]

    coder = data.alpha_coder
    if data.is_rna_pwm:
        # one annotation character per column: alph0 varies slowest, alph1 fastest
        annotation_seq = ''.join(ch * len(coder.alph1) for ch in coder.alph0)
        annotation_struct = ''.join(coder.alph1 * len(coder.alph0))
    else:
        annotation_seq, annotation_struct = coder.decode(coder.alphabet)

    pwm_struct = self._extract_pwm(input_data, annotation_struct, coder.alph1)
    pwm_seq = self._extract_pwm(input_data, annotation_seq, coder.alph0)
    motif_struct = Motif(coder.alph1, pwm=pwm_struct).plot(colors_structure, scale=0.25)
    motif_seq = Motif(coder.alph0, pwm=pwm_seq).plot(colors_sequence, scale=0.25)
    return [motif_seq, motif_struct]
class Test_Motif(unittest.TestCase):
    """Tests for Motif objects built from a sequence and from a PWM.

    Both fixtures describe the word "GATTACA" over the alphabet "ACGT", so
    every check is run on both and the two are compared with each other.
    """

    def setUp(self):
        """Build one motif from a sequence and one from the matching PWM."""
        # one non-zero entry per row, at the column of the letter in "ACGT"
        self.ref_pwm = np.array(
            [
                [0, 0, 0.25, 0],
                [0.25, 0, 0, 0],
                [0, 0, 0, 0.25],
                [0, 0, 0, 0.25],
                [0.25, 0, 0, 0],
                [0, 0.25, 0, 0],
                [0.25, 0, 0, 0],
            ],
            dtype=np.float32)
        self.m = Motif("ACGT", ["GATTACA"])
        self.m2 = Motif("ACGT", pwm=self.ref_pwm)

    def test_motif_init(self):
        """The alphabet passed to the constructor is stored unchanged."""
        self.assertEqual(self.m.alphabet, "ACGT")
        self.assertEqual(self.m2.alphabet, "ACGT")

    def test_motif_valid_pwm(self):
        """PWMs are 7x4, lie in [0, 1], have rows summing to 1 and agree."""
        for motif in (self.m, self.m2):
            self.assertEqual(motif.pwm.shape, (7, 4))
            self.assertTrue((motif.pwm >= 0).all() and (motif.pwm <= 1).all())
            self.assertTrue(np.allclose(np.sum(motif.pwm, axis=1), [1] * 7))
        self.assertTrue(np.allclose(self.m.pwm, self.m2.pwm))

    def test_motif_valid_entropies(self):
        """Entropies have one value per position, lie in [0, 2] and agree."""
        for motif in (self.m, self.m2):
            self.assertEqual(motif.entropies.shape, (7, ))
            self.assertTrue((motif.entropies >= 0).all()
                            and (motif.entropies <= 2).all())
        self.assertTrue(np.allclose(self.m.entropies, self.m2.entropies))

    def test_motif_plot(self):
        """plot() returns an image object for both motifs."""
        self.assertIsInstance(self.m.plot(), Image.Image)
        self.assertIsInstance(self.m2.plot(), Image.Image)
from random import choice
import gzip
from pysster.Motif import Motif


def rand_dna(length):
    """Return a random string of ``length`` characters drawn from "ACGT"."""
    return "".join([choice("ACGT") for _ in range(length)])


def _planted(left, first, gap, second, right):
    """Return random DNA with two fixed runs embedded at fixed offsets.

    The random parts are drawn left to right: ``left`` random characters,
    ``first``, ``gap`` random characters, ``second``, ``right`` random
    characters.
    """
    return rand_dna(left) + first + rand_dna(gap) + second + rand_dna(right)


def _write_records(handle, sequences):
    """Write every sequence as a two-line FASTA record with header ">1"."""
    for sequence in sequences:
        handle.write(">1\n{}\n".format(sequence))


num = 5000

# positive set: two halves with different planted runs, written to one file
with gzip.open("artifical_pos.fasta.gz", "wt") as handle:
    seqs = [_planted(20, "CCCCCCCCCC", 20, "GGGGGGGGGG", 80) for _ in range(num)]
    _write_records(handle, seqs)
    Motif("ACGT", seqs).plot().save("pos_half1.png")

    seqs = [_planted(80, "AAAAAAAAAA", 20, "TTTTTTTTTT", 20) for _ in range(num)]
    _write_records(handle, seqs)
    Motif("ACGT", seqs).plot().save("pos_half2.png")

# negative set: purely random sequences, as many as both positive halves
with gzip.open("artifical_neg.fasta.gz", "wt") as handle:
    seqs = [rand_dna(140) for _ in range(num * 2)]
    Motif("ACGT", seqs).plot().save("neg.png")
    _write_records(handle, seqs)