def test_gds_info(self):
    """Check that GDSInfo is populated and holds the expected test sample.

    Verifies that keys, items and values are all non-empty, then checks
    the gene count, sample count and number of subsets recorded for
    ``self.test_sample``.
    """
    catalogue = GDSInfo()
    self.assertIsNotNone(catalogue)

    # Each of the mapping-style accessors must report at least one entry.
    for accessor in (catalogue.keys, catalogue.items, catalogue.values):
        self.assertGreater(len(accessor()), 0)

    record = catalogue[self.test_sample]
    self.assertIsNotNone(record)
    self.assertEqual(record['genes'], 9561)
    # sample_count is passed through int() before comparing.
    self.assertEqual(int(record['sample_count']), 4)
    self.assertEqual(len(record['subsets']), 2)
def valid(info, n=40):
    """Return the subset types in which every subset has at least n samples.

    A type is rejected as soon as one of its subsets lists fewer than
    ``n`` entries under ``"sample_id"``; a subset with exactly ``n``
    entries is accepted.

    :param info: a GDS info record; ``info["subsets"]`` is read as a
        sequence of mappings, each with a ``"type"`` and a ``"sample_id"``
        collection.
    :param n: minimum number of samples required in each subset.
    :return: set of subset types that pass the size check.
    """
    subsets = info["subsets"]
    all_types = {sinfo["type"] for sinfo in subsets}
    too_small = {
        sinfo["type"] for sinfo in subsets if len(sinfo["sample_id"]) < n
    }
    return all_types - too_small


def report(stypes, info):
    """Pretty-print each GDS id followed by its valid subset types.

    For every subset type one line is printed, listing the matching
    subsets as ``description/sample-count`` separated by commas.

    :param stypes: iterable of ``(gds_id, subset_types)`` pairs.
    :param info: mapping from GDS id to its info record.
    """
    for gds_id, subset_types in stypes:
        print(gds_id)
        for subset_type in subset_types:
            # Looked up inside the loop so that a dataset with no subset
            # types never touches ``info``.
            gds = info[gds_id]
            summary = ", ".join([
                "%s/%d" % (sinfo["description"], len(sinfo["sample_id"]))
                for sinfo in gds["subsets"]
                if sinfo["type"] == subset_type
            ])
            print(" %s:" % subset_type + summary)


gdsinfo = GDSInfo()

# Evaluate valid() once per dataset, then keep only the datasets that have at
# least one subset type passing the size check.
_candidates = (
    (gds_id, valid(record)) for gds_id, record in sorted(gdsinfo.items())
)
valid_subset_types = [
    (gds_id, subset_types)
    for gds_id, subset_types in _candidates
    if subset_types
]

report(valid_subset_types, gdsinfo)

print('datasets = ' + str(len(valid_subset_types)))
print('type subsets = ' + str(sum(len(b) for _, b in valid_subset_types)))