def test_to_freqs(self):
    """counts divided by their row totals give the freqs array"""
    counts = array([[2, 4], [3, 5], [4, 8]])
    # column vector of per-row totals so the division broadcasts row-wise
    row_totals = vstack(counts.sum(axis=1))
    expected = counts / row_totals
    freqs = MotifCountsArray(array(counts), "AB").to_freq_array()
    assert_allclose(freqs.array, expected)
def test_to_freqs_1d(self):
    """a 1D vector of counts converts to a freqs array"""
    counts = [43, 48, 114, 95]
    denominator = sum(counts)
    expected = array([c / denominator for c in counts], dtype=float)
    profile = MotifCountsArray(list(counts), motifs=("T", "C", "A", "G"))
    freqs = profile.to_freq_array()
    assert_allclose(freqs.array, expected)
def test_sliced_range(self):
    """taking a range of rows keeps the corresponding row indices"""
    species = ["FlyingFox", "DogFaced", "FreeTaile"]
    table = [
        [316, 134, 133, 317],
        [321, 136, 123, 314],
        [331, 143, 127, 315],
    ]
    full = MotifCountsArray(table, ("A", "C", "G", "T"), row_indices=species)
    self.assertEqual(full.keys(), species)
    first_two = full[:2]
    self.assertEqual(first_two.keys(), species[:2])
def test_slicing(self):
    """a row can be selected from a freqs array by its key"""
    row = [3, 2, 3, 2]
    counts = MotifCountsArray(
        [list(row), list(row)],
        ["A", "C", "G", "T"],
        row_indices=["DogFaced", "FlyingFox"],
    )
    selected = counts.to_freq_array()["FlyingFox"]
    assert_allclose(selected.to_array(), [0.3, 0.2, 0.3, 0.2])
def test_load_tabular_motif_counts_array(self):
    """a MotifCountsArray written as tsv is recovered by load_tabular"""
    original = MotifCountsArray([[2, 4], [3, 5], [4, 8]], "AB")
    read_counts = io_app.load_tabular(sep="\t", as_type="motif_counts")
    with TemporaryDirectory(dir=".") as tmp_dir:
        tsv_path = join(tmp_dir, "delme.tsv")
        tsv_writer = io_app.write_tabular(data_path=tmp_dir, format="tsv")
        tsv_writer.write(original, identifier=tsv_path)
        # load while the temporary directory still exists
        reloaded = read_counts(tsv_path)
        self.assertEqual(original.to_dict(), reloaded.to_dict())
def test_to_freqs_pseudocount(self):
    """freqs array reflects the pseudocount added to every count"""
    counts = array([[2, 4], [3, 5], [0, 8]])
    marr = MotifCountsArray(array(counts), "AB")
    # exercise both an integer and a fractional pseudocount
    for pseudocount in (1, 0.5):
        freqs = marr.to_freq_array(pseudocount=pseudocount)
        shifted = counts + pseudocount
        expected = shifted / vstack(shifted.sum(axis=1))
        assert_allclose(freqs.array, expected)
def test_to_pssm_pseudocount(self):
    """produces a PSSM array with pseudocount

    The expected scores are computed here from the raw counts rather than
    from a private method of the object under test, so the check is
    independent of the implementation being verified.
    """
    data = array([
        [10, 30, 50, 10],
        [25, 25, 25, 25],
        [5, 80, 0, 10],
        [70, 10, 10, 10],
        [60, 15, 0, 20],
    ])
    marr = MotifCountsArray(array(data), "ACGT")
    got = marr.to_pssm(pseudocount=1)
    # add the pseudocount to every cell, then normalise each row to sum to 1
    adj = data + 1
    freqs = adj / vstack(adj.sum(axis=1))
    # log2 odds relative to 0.25, i.e. an equal share for each of 4 motifs
    expect = log2(freqs / 0.25)
    assert_allclose(got.array, expect, atol=1e-3)
def test_construct_succeeds(self):
    """construction works from an int array or a list of ints"""
    from cogent3.maths.stats.number import CategoryCounter

    states = "ACGT"
    counters = [CategoryCounter([base] * 20) for base in "ACGT"]
    rows = [counter.tolist(states) for counter in counters]
    # building from the counter-derived rows only needs to not raise
    MotifCountsArray(rows, states)

    data = [[2, 4], [3, 5], [4, 8]]
    for source in (array(data), data):
        built = MotifCountsArray(source, "AB")
        self.assertEqual(built.array.tolist(), data)
def read(filepath):
    """returns matrixid and MotifCountsArray matrix

    Parameters
    ----------
    filepath
        path to a text file containing a header line starting with ">"
        and one line per state of the form "<state> <count> <count> ..."

    Returns
    -------
    (identifier, pwm), where identifier is the header line (minus the
    leading ">") split on whitespace, and pwm is a MotifCountsArray whose
    columns correspond to the states of the inferred moltype.

    Raises
    ------
    ValueError
        if the file contains no ">" header line
    """
    identifier = None
    matrix = []
    states = []
    with open(filepath) as infile:
        for line in infile:
            line = line.strip()
            if line.startswith(">"):
                identifier = line[1:].split()
            elif line:
                # remove whatever _brackets matches before tokenising
                # (presumably the delimiters around the counts -- confirm
                # against the pattern definition)
                fields = _brackets.sub("", line).split()
                # first token is the state label, the rest are its counts
                states.append(fields.pop(0).upper())
                matrix.append([int(i) for i in fields])

    if identifier is None:
        # previously this surfaced as an UnboundLocalError at the return
        raise ValueError("no '>' header line found in {!r}".format(filepath))

    matrix = dict(zip(states, matrix))
    # four states means nucleic acid (RNA when "U" is present), otherwise
    # the data are treated as protein
    if len(states) == 4:
        name = "rna" if "U" in states else "dna"
    else:
        name = "protein"

    # order the rows by the moltype's own state order, then transpose so
    # that states become columns
    states = list(get_moltype(name))
    matrix = array([matrix[s] for s in states], dtype=int).T
    pwm = MotifCountsArray(matrix, states)
    return identifier, pwm
def test_getitem(self):
    """indexing returns the expected values, with and without row names"""
    data = array([[2, 4], [3, 5], [4, 8]])
    column_a = [[2], [3], [4]]
    # (extra constructor options, key of first row, key of second row)
    scenarios = (
        ({}, 0, 1),
        ({"row_indices": ["a", "b", "c"]}, "a", "b"),
    )
    for options, first, second in scenarios:
        marr = MotifCountsArray(array(data), "AB", **options)
        self.assertEqual(marr[first].array.tolist(), [2, 4])
        self.assertEqual(marr[first, "B"], 4)
        self.assertEqual(marr[first, :].array.tolist(), [2, 4])
        self.assertEqual(marr[:, "A"].array.tolist(), column_a)
        self.assertEqual(marr[:, "A":"B"].array.tolist(), column_a)
        self.assertEqual(marr[second, "A"], 3)
def test_to_pssm(self):
    """counts convert to the expected PSSM scores"""
    counts = array([
        [10, 30, 50, 10],
        [25, 25, 25, 25],
        [5, 80, 5, 10],
        [70, 10, 10, 10],
        [60, 15, 5, 20],
    ])
    expected_scores = array([
        [-1.322, 0.263, 1.0, -1.322],
        [0.0, 0.0, 0.0, 0.0],
        [-2.322, 1.678, -2.322, -1.322],
        [1.485, -1.322, -1.322, -1.322],
        [1.263, -0.737, -2.322, -0.322],
    ])
    pssm = MotifCountsArray(array(counts), "ACGT").to_pssm()
    # expected values are quoted to 3 decimal places, hence the tolerance
    assert_allclose(pssm.array, expected_scores, atol=1e-3)
def test_construct_fails(self):
    """construction raises ValueError for wrong data types or empty data"""
    invalid_inputs = (
        [["A", "A"], ["A", "A"], ["A", "A"]],  # can't use a string
        [[1.1, 2.1], [0.0, 2.1], [3.0, 4.5]],  # or a float
        [],  # or be empty
        [[], []],
    )
    for bad in invalid_inputs:
        with self.assertRaises(ValueError):
            MotifCountsArray(bad, "AB")

    # a PSSM built from these counts must be rejected as well
    with self.assertRaises(ValueError):
        PSSM([[2, 4], [3, 5], [4, 8]], "ACGT")
def test_write_tabular_motif_counts_array(self):
    """a MotifCountsArray is written to file in long tabular form"""
    data = [[2, 4], [3, 5], [4, 8]]
    mca = MotifCountsArray(data, "AB")
    read_table = io_app.load_tabular(sep="\t")
    with TemporaryDirectory(dir=".") as tmp_dir:
        tsv_path = join(tmp_dir, "delme.tsv")
        tsv_writer = io_app.write_tabular(data_path=tmp_dir, format="tsv")
        tsv_writer.write(mca, identifier=tsv_path)
        loaded = read_table(tsv_path)
        # the written table has one record per (row, motif) cell, labelled
        # with the columns dim-1, dim-2 and value; in dict form the records
        # are keyed by their running index
        cells = [
            (row_index, motif, value)
            for row_index, row in enumerate(data)
            for motif, value in zip("AB", row)
        ]
        expected = {
            record: {"dim-1": row_index, "dim-2": motif, "value": value}
            for record, (row_index, motif, value) in enumerate(cells)
        }
        self.assertEqual(expected, loaded.to_dict())
def test_to_dict(self):
    """to_dict nests the motif counts under the row index"""
    marr = MotifCountsArray([[4, 0, 0]], ["A", "C", "D"])
    expected = {0: dict(zip("ACD", (4, 0, 0)))}
    self.assertEqual(marr.to_dict(), expected)
def test_str_repr(self):
    """str and repr can be called without raising"""
    counts = array([[2, 4], [3, 5], [4, 8]])
    marr = MotifCountsArray(array(counts), "AB")
    # only the absence of an exception is checked, not the text
    for render in (str, repr):
        render(marr)
def test_take(self):
    """take behaves like numpy take and additionally supports negation"""
    data = array([[2, 4, 9, 2], [3, 5, 8, 0], [4, 8, 25, 13]])
    motifs = ["A", "B", "C", "D"]

    def check(profile, indices, axis, kept, negate=False):
        # compare against numpy's take of the positions expected to remain
        options = {"negate": True} if negate else {}
        taken = profile.take(indices, axis=axis, **options)
        assert_allclose(taken.array, data.take(kept, axis=axis))

    marr = MotifCountsArray(data, list(motifs))
    # indices must be indexable, a bare int is rejected
    with self.assertRaises(ValueError):
        marr.take(1, axis=1)

    # columns, selected first by key and then by position
    for selector in (["A", "D"], [0, 3]):
        check(marr, selector, 1, [0, 3])
        check(marr, selector, 1, [1, 2], negate=True)

    marr = MotifCountsArray(data, list(motifs), row_indices=["a", "b", "c"])
    # rows selected by key
    check(marr, ["a", "c"], 0, [0, 2])
    check(marr, ["a"], 0, [1, 2], negate=True)
    # rows selected by position
    check(marr, [0, 2], 0, [0, 2])
    check(marr, [0], 0, [1, 2], negate=True)

    # 1D profile
    first_row = data[0]
    marr = MotifCountsArray(first_row, list(motifs))
    remaining = marr.take([0], negate=True, axis=1)
    assert_allclose(remaining.array, first_row.take([1, 2, 3]))
def test_iter(self):
    """iterating a counts array yields one entry per position"""
    counts = MotifCountsArray(array([[2, 4], [3, 5], [4, 8]]), "AB")
    expected_shape = (2,)
    for position in counts:
        self.assertEqual(position.shape, expected_shape)