class App(object): def __init__(self, host, user, passwd): self.kb = KB(driver="omero")(host, user, passwd) self.logger = logging.getLogger() def compute(self, maker, model, release): mset = self.kb.get_snp_markers_set(maker, model, release) if not mset: raise ValueError("SNPMarkersSet[%s,%s,%s] has not been defined." % (maker, model, release)) # projector = (np.arange(0, 100), np.array([101, 109]), np.arange(110,N)) # selector = kb.build_selector( # s = self.kb.get_gdo_iterator(mset, selector, projector) s = self.kb.get_gdo_iterator(mset) # -- start = time.clock() counts = algo.count_homozygotes(s) print "counts on %d:" % counts[0], time.clock() - start start = time.clock() mafs = algo.maf(None, counts) print "mafs on %d:" % counts[0], time.clock() - start start = time.clock() hwe = algo.hwe(None, counts) print "hwe on %d:" % counts[0], time.clock() - start
def main(argv):
    """Load GenotypeDataSample objects and iterate over the GDOs of the
    SNPMarkersSet named by ``args.marker_set``, up to ``args.fetch_size``
    samples.

    Exits with status 2 if the markers set cannot be found.
    """
    parser = make_parser()
    args = parser.parse_args(argv)
    log_level = getattr(logging, args.loglevel)
    kwargs = {'format': LOG_FORMAT,
              'datefmt': LOG_DATEFMT,
              'level': log_level}
    if args.logfile:
        kwargs['filename'] = args.logfile
    logging.basicConfig(**kwargs)
    logger = logging.getLogger()
    kb = KB(driver='omero')(args.host, args.user, args.passwd)
    logger.info('Loading GenotypeDataSample objects')
    dsamples = kb.get_objects(kb.GenotypeDataSample)
    logger.info('Loaded %d objects' % len(dsamples))
    logger.info('Loading SNPMarkersSet')
    query = 'SELECT snpm FROM SNPMarkersSet snpm WHERE snpm.label = :mset_label'
    # FIX: the old code did find_all_by_query(...)[0] and THEN checked
    # "if not mset" -- an empty result raised IndexError before the check,
    # making the error branch unreachable. Check emptiness first.
    results = kb.find_all_by_query(query, {'mset_label': args.marker_set})
    if not results:
        logger.error('Unable to load SNPMarkersSet with label %s' % args.marker_set)
        sys.exit(2)
    mset = results[0]
    logger.info('Object loaded')
    gdo_iterator = kb.get_gdo_iterator(mset, dsamples[:args.fetch_size])
    gdos = []
    logger.info('Loading GDOs')
    for gdo in gdo_iterator:
        logger.info(gdo['vid'])
        gdos.append(gdo)
        logger.debug('%d/%d GDOs loaded' % (len(gdos), args.fetch_size))
    logger.info('Loaded %d GDOs' % len(gdos))
def main(argv):
    """Load GenotypeDataSample objects and iterate over the GDOs of the
    SNPMarkersSet named by ``args.marker_set``, up to ``args.fetch_size``
    samples.

    Exits with status 2 if the markers set cannot be found.
    """
    parser = make_parser()
    args = parser.parse_args(argv)
    log_level = getattr(logging, args.loglevel)
    kwargs = {'format': LOG_FORMAT,
              'datefmt': LOG_DATEFMT,
              'level': log_level}
    if args.logfile:
        kwargs['filename'] = args.logfile
    logging.basicConfig(**kwargs)
    logger = logging.getLogger()
    kb = KB(driver='omero')(args.host, args.user, args.passwd)
    logger.info('Loading GenotypeDataSample objects')
    dsamples = kb.get_objects(kb.GenotypeDataSample)
    logger.info('Loaded %d objects' % len(dsamples))
    logger.info('Loading SNPMarkersSet')
    query = 'SELECT snpm FROM SNPMarkersSet snpm WHERE snpm.label = :mset_label'
    # FIX: indexing [0] before the emptiness check raised IndexError on an
    # empty query result, so the error/exit branch could never run.
    results = kb.find_all_by_query(query, {'mset_label': args.marker_set})
    if not results:
        logger.error('Unable to load SNPMarkersSet with label %s' % args.marker_set)
        sys.exit(2)
    mset = results[0]
    logger.info('Object loaded')
    gdo_iterator = kb.get_gdo_iterator(mset, dsamples[:args.fetch_size])
    gdos = []
    logger.info('Loading GDOs')
    for gdo in gdo_iterator:
        logger.info(gdo['vid'])
        gdos.append(gdo)
        logger.debug('%d/%d GDOs loaded' % (len(gdos), args.fetch_size))
    logger.info('Loaded %d GDOs' % len(gdos))
class App(object): def __init__(self, host, user, passwd): self.kb = KB(driver='omero')(host, user, passwd) self.logger = logging.getLogger() def compute(self, maker, model, release): mset = self.kb.get_snp_markers_set(maker, model, release) if not mset: raise ValueError('SNPMarkersSet[%s,%s,%s] has not been defined.' % (maker, model, release)) # projector = (np.arange(0, 100), np.array([101, 109]), np.arange(110,N)) # selector = kb.build_selector( # s = self.kb.get_gdo_iterator(mset, selector, projector) s = self.kb.get_gdo_iterator(mset) #-- start = time.clock() counts = algo.count_homozygotes(s) print 'counts on %d:' % counts[0], time.clock() - start start = time.clock() mafs = algo.maf(None, counts) print 'mafs on %d:' % counts[0], time.clock() - start start = time.clock() hwe = algo.hwe(None, counts) print 'hwe on %d:' % counts[0], time.clock() - start
Note that what we have now is a dictionary that maps individual ids to GenotypeDataSample objects and the latter are only handlers to get to the actual genotyping data, not the data itself. We can, now, do a global check on data quality. """ def do_check(s): counts = algo.count_homozygotes(s) mafs = algo.maf(None, counts) hwe = algo.hwe(None, counts) return mafs, hwe gds0_data_samples = gds0_by_individual.values() s = kb.get_gdo_iterator(mset0, data_samples=gds0_data_samples) mafs, hwe = do_check(s) """ .. Similar to above, but now we subselect on a slice of the markers """ s = kb.get_gdo_iterator(mset0, indices=slice(1, len(mset0)-1), data_samples=gds0_data_samples) mafs, hwe = do_check(s) """ .. Same as above, but now we work on a subset of possible markers
the actual genotyping data, not the data itself. We can, now, do a global check on data quality. """ def do_check(s): counts = algo.count_homozygotes(s) mafs = algo.maf(None, counts) hwe = algo.hwe(None, counts) return mafs, hwe gds0_data_samples = gds0_by_individual.values() s = kb.get_gdo_iterator(mset0, data_samples=gds0_data_samples) mafs, hwe = do_check(s) """ .. Similar to above, but now we subselect on a slice of the markers """ s = kb.get_gdo_iterator(mset0, indices=slice(1, len(mset0) - 1), data_samples=gds0_data_samples) mafs, hwe = do_check(s) """ .. Same as above, but now we work on a subset of possible markers
class markers_set(unittest.TestCase):
    """Integration tests for SNPMarkersSet creation, alignment, intersection
    and genotype-data-object (GDO) access against a live OMERO-backed KB.

    Every persisted object is pushed onto ``self.kill_list`` so tearDown can
    delete it (in reverse creation order) and leave the server clean.
    """

    def __init__(self, name):
        super(markers_set, self).__init__(name)
        # Objects to delete in tearDown, in creation order.
        self.kill_list = []

    def setUp(self):
        # Fresh KB session per test, plus a throwaway Study and Action that
        # anchor everything this test creates.
        self.kb = KB(driver='omero')(OME_HOST, OME_USER, OME_PASSWD)
        conf = {
            'label': 'TEST-%f' % time.time(),  # time-stamped to avoid collisions
            'description': 'unit test garbage',
        }
        self.study = self.kb.factory.create(self.kb.Study, conf).save()
        self.kill_list.append(self.study)
        self.action = self.kb.create_an_action(self.study)
        self.kill_list.append(self.action)

    def tearDown(self):
        # Delete in reverse creation order so dependents go before their
        # parents (e.g. action before study).
        self.kill_list.reverse()
        for x in self.kill_list:
            self.kb.delete(x)
        self.kill_list = []

    def create_markers(self, N):
        """Persist N fake SNP markers with unique labels; return the KB's
        (label, vid, ...) records for them."""
        def marker_generator():
            for i in range(N):
                label = 'A%f-%d' % (time.time(), i)
                # (label, rs_label, mask) -- same string doubles as rs label
                yield (label, label, 'ACCA[A/B]TACCA')
        source, context, release = 'unit_testing', 'markers_set', '%f' % time.time()
        ref_rs_genome, dbsnp_build = 'foo-rs-genome', 123000
        lvs = self.kb.create_markers(source, context, release,
                                     ref_rs_genome, dbsnp_build,
                                     marker_generator(), self.action)
        return lvs

    def create_snp_markers_set(self, lvs):
        """Build and persist a SNPMarkersSet from marker records ``lvs``;
        marker i is placed at index i with flip flag False."""
        label = 'ams-%f' % time.time()
        maker, model, release = 'FOO', 'FOO1', '%f' % time.time()
        # (marker_vid, index_in_set, allele_flip)
        markers_selection = [(v[1], i, False) for i, v in enumerate(lvs)]
        mset = self.kb.create_snp_markers_set(label, maker, model, release,
                                              len(lvs), markers_selection,
                                              self.action)
        self.kill_list.append(mset)
        return mset

    def create_alignments(self, mset, ref_genome, n_duplicates):
        """Persist one alignment per marker (plus an extra copy for the first
        ``n_duplicates`` markers, which makes them multi-hit).

        Returns the expected per-marker position list: (chromosome, pos) for
        uniquely aligned markers, (0, 0) for the duplicated ones.
        """
        mset.load_markers()
        self.assertTrue(len(mset) > 0)
        # NOTE(review): n_aligns is computed but never used below.
        n_aligns = len(mset.markers) + n_duplicates
        pos = []
        def insert_duplicates(markers):
            count = 0
            for i, m in enumerate(markers):
                n_copies = 1
                if count < n_duplicates:
                    # First n_duplicates markers get a second alignment at a
                    # different position, so n_copies == 2 for both records.
                    count += 1
                    n_copies = 2
                    r = (m[0], random.randint(1, 26), 22 + i*1000, True,
                         'A' if (i % 2) == 0 else 'B', n_copies)
                    yield r
                r = (m[0], random.randint(1, 26), 1 + i*1000, True,
                     'A' if (i % 2) == 0 else 'B', n_copies)
                # Multi-hit markers are expected to resolve to position (0, 0).
                pos.append((0, 0) if n_copies > 1 else (r[1], r[2]))
                yield r
        aligns = [x for x in insert_duplicates(mset.markers)]
        # Shuffle to prove that alignment order does not matter to the KB.
        random.shuffle(aligns)
        self.kb.align_snp_markers_set(mset, ref_genome,
                                      aligns, self.action)
        return pos

    def create_data_sample(self, mset, label):
        """Persist and return a GenotypeDataSample bound to ``mset``."""
        conf = {
            'label': label,
            'status': self.kb.DataSampleStatus.USABLE,
            'action': self.action,
            'snpMarkersSet': mset,
        }
        data_sample = self.kb.factory.create(self.kb.GenotypeDataSample, conf).save()
        self.kill_list.append(data_sample)
        return data_sample

    def create_data_object(self, data_sample, add_nan=False):
        """Attach fake genotype data to ``data_sample``; return the
        (probs, confs) arrays that were stored."""
        probs, confs = make_fake_data(data_sample.snpMarkersSet, add_nan)
        do = self.kb.add_gdo_data_object(self.action, data_sample, probs, confs)
        self.kill_list.append(do)
        return probs, confs

    def create_aligned_mset(self, N, N_dups, ref_genome):
        """Convenience: markers + markers set + alignments in one call."""
        lvs = self.create_markers(N)
        mset = self.create_snp_markers_set(lvs)
        pos = self.create_alignments(mset, ref_genome, N_dups)
        return mset, pos

    def test_creation_destruction(self):
        # Round-trip: markers stored in the set come back in insertion order.
        N = 32
        lvs = self.create_markers(N)
        mset = self.create_snp_markers_set(lvs)
        mset.load_markers()
        self.assertEqual(len(mset), N)
        for lv, m in it.izip(lvs, mset.markers):
            self.assertEqual(lv[1], m[0])

    def test_align(self):
        # Loaded alignments must match the positions recorded at creation,
        # including the (0, 0) convention for multi-hit markers.
        N = 16
        N_dups = 4
        ref_genome = 'g' + ('%f' % time.time())[-14:]
        mset, pos = self.create_aligned_mset(N, N_dups, ref_genome)
        mset.load_alignments(ref_genome)
        for p, m in it.izip(pos, mset.get_markers_iterator()):
            self.assertEqual(p, m.position)

    def test_read_ssc(self):
        # Write fake data to an SSC file and read it back; arrays must match.
        N = 16
        N_dups = 4
        ref_genome = 'g' + ('%f' % time.time())[-14:]
        mset, pos = self.create_aligned_mset(N, N_dups, ref_genome)
        mset.load_markers(additional_fields=['label'])
        probs, confs = make_fake_data(mset)
        sample_id = 'ffoo-%f' % time.time()
        fn = tempfile.NamedTemporaryFile().name
        make_fake_ssc(mset, sample_id, probs, confs, fn)
        probs_1, confs_1 = gio.read_ssc(fn, mset)
        self.assertAlmostEqual(np.sum(np.abs(probs - probs_1)), 0.0)
        self.assertAlmostEqual(np.sum(np.abs(confs - confs_1)), 0.0)

    def test_gdo(self):
        # Stored genotype data must round-trip both via resolve_to_data()
        # and via the GDO iterator, with and without an index slice.
        N = 32
        lvs = self.create_markers(N)
        mset = self.create_snp_markers_set(lvs)
        mset.load_markers()
        data_sample = self.create_data_sample(mset, 'foo-data')
        probs, confs = self.create_data_object(data_sample)
        probs1, confs1 = data_sample.resolve_to_data()
        self.assertTrue((probs == probs1).all())
        self.assertTrue((confs == confs1).all())
        s = self.kb.get_gdo_iterator(mset, data_samples=[data_sample])
        for i, x in enumerate(s):
            self.assertTrue((probs == x['probs']).all())
            self.assertTrue((confs == x['confidence']).all())
            # Exactly one GDO expected for the single data sample.
            self.assertEqual(i, 0)
        indices = slice(N/4, N/2)
        s = self.kb.get_gdo_iterator(mset, data_samples=[data_sample],
                                     indices=indices)
        for i, x in enumerate(s):
            # probs has one row per allele, so the slice applies to columns.
            self.assertTrue((probs[:, indices] == x['probs']).all())
            self.assertTrue((confs[indices] == x['confidence']).all())
            self.assertEqual(i, 0)

    def test_define_range_selector(self):
        # Markers inside [low_pos, high_pos] must be selected, others not.
        N, N_dups = 16, 0
        ref_genome = 'g' + ('%f' % time.time())[-14:]
        mset, pos = self.create_aligned_mset(N, N_dups, ref_genome)
        mset.load_alignments(ref_genome)
        pos.sort()
        if len(pos) > 2:
            # Use the second-lowest and second-highest positions as bounds so
            # both in-range and out-of-range markers exist.
            low_pos, high_pos = pos[1], pos[-2]
            gc_range = (low_pos, high_pos)
            range_sel = self.kb.SNPMarkersSet.define_range_selector(mset, gc_range)
            i = 0
            for (i, m) in enumerate(mset.get_markers_iterator()):
                if i in range_sel:
                    self.assertTrue(low_pos <= m.position <= high_pos)
                else:
                    self.assertTrue(low_pos > m.position or high_pos < m.position)

    def test_intersect(self):
        # mset2 shares its alignments with the first half of mset1, except
        # that the first M1 (resp. M2) markers are made unmapped (pos 0,0).
        ref_genome = 'g' + ('%f' % time.time())[-14:]
        N1 = 16
        M1 = 2
        N2 = N1/2
        M2 = 1
        lvs = self.create_markers(N1)
        mset1 = self.create_snp_markers_set(lvs)
        mset1.load_markers()
        aligns = [(m[0], random.randint(1, 26), 1 + i*2000, True, 'A', 1)
                  for i, m in enumerate(mset1.markers)]
        for i in range(M1):
            # Mark as unaligned: chromosome 0, position 0, zero copies.
            aligns[i] = (aligns[i][0], 0, 0, True, 'A', 0)
        self.kb.align_snp_markers_set(mset1, ref_genome, aligns, self.action)
        lvs = self.create_markers(N2)
        mset2 = self.create_snp_markers_set(lvs)
        mset2.load_markers()
        # Reuse mset1's positions for mset2's markers (same genome coords).
        aligns = [(m[0], a[1], a[2], a[3], a[4], a[5])
                  for m, a in it.izip(mset2.markers, aligns[:len(mset2)])]
        for i in range(M2):
            aligns[i] = (aligns[i][0], 0, 0, True, 'A', 0)
        self.kb.align_snp_markers_set(mset2, ref_genome, aligns, self.action)
        mset1.load_alignments(ref_genome)
        mset2.load_alignments(ref_genome)
        # Self-intersection returns every marker, with identical index arrays.
        idx1, idx2 = self.kb.SNPMarkersSet.intersect(mset1, mset1)
        self.assertTrue(np.array_equal(idx1, idx2))
        self.assertEqual(len(idx1), len(mset1))
        self.assertEqual(len(idx1), N1)
        # Cross-intersection drops the unmapped markers of both sets.
        idx1, idx2 = self.kb.SNPMarkersSet.intersect(mset1, mset2)
        self.assertEqual(len(idx1), len(idx2))
        self.assertEqual(len(idx1), N2 - max(M1, M2))
        for i, j in it.izip(idx1, idx2):
            m1, m2 = mset1[i], mset2[j]
            self.assertEqual(m1.position, m2.position)
            self.assertTrue(m1.position > (0, 0))

    def test_speed(self):
        # Timing benchmark, not a correctness test: exercises the full
        # create/align/load/intersect pipeline on ~1M markers and prints
        # elapsed wall-clock times.
        ref_genome = 'g' + ('%f' % time.time())[-14:]
        N1 = 1024*1024
        # NOTE(review): N2 is computed but never used in this test.
        N2 = N1/2
        beg = time.time()
        lvs = self.create_markers(N1)
        print ''
        print 'creating %d markers took %f' % (N1, time.time() - beg)
        beg = time.time()
        mset1 = self.create_snp_markers_set(lvs)
        print 'creating a markers set with %d markers took %f' % (N1, time.time() - beg)
        beg = time.time()
        mset1.load_markers()
        print 'loading markers took %f' % (time.time() - beg)
        beg = time.time()
        aligns = [(m[0], random.randint(1, 26), 1 + i*2000, True, 'A', 1)
                  for i, m in enumerate(mset1.markers)]
        print 'creating %d aligns took %f' % (N1, time.time() - beg)
        beg = time.time()
        self.kb.align_snp_markers_set(mset1, ref_genome, aligns, self.action)
        print 'saving %d aligns took %f' % (N1, time.time() - beg)
        beg = time.time()
        mset1.load_alignments(ref_genome)
        print 'loading %d aligns took %f' % (N1, time.time() - beg)
        beg = time.time()
        idx1, idx2 = self.kb.SNPMarkersSet.intersect(mset1, mset1)
        print 'intersecting %d aligns took %f' % (N1, time.time() - beg)

    def test_speed_gdo(self):
        # Timing benchmark for GDO storage and iteration on a realistic
        # chip-sized markers set; also re-checks data round-trip equality.
        N = 934968
        beg = time.time()
        lvs = self.create_markers(N)
        print ''
        print 'creating %d markers took %f' % (N, time.time() - beg)
        mset = self.create_snp_markers_set(lvs)
        beg = time.time()
        mset.load_markers()
        print 'loading %d markers took %f' % (N, time.time() - beg)
        beg = time.time()
        data_sample = self.create_data_sample(mset, 'foo-data')
        print 'creating a data sample took %f' % (time.time() - beg)
        beg = time.time()
        probs, confs = self.create_data_object(data_sample)
        print 'creating a data object took %f' % (time.time() - beg)
        beg = time.time()
        probs1, confs1 = data_sample.resolve_to_data()
        print 'resolving to data took %f' % (time.time() - beg)
        self.assertTrue((probs == probs1).all())
        self.assertTrue((confs == confs1).all())
        beg = time.time()
        s = self.kb.get_gdo_iterator(mset, data_samples=[data_sample])
        for i, x in enumerate(s):
            self.assertTrue((probs == x['probs']).all())
            self.assertTrue((confs == x['confidence']).all())
            self.assertEqual(i, 0)
        print 'iterating took %f' % (time.time() - beg)
        indices = slice(N/4, N/2)
        beg = time.time()
        s = self.kb.get_gdo_iterator(mset, data_samples=[data_sample],
                                     indices=indices)
        for i, x in enumerate(s):
            self.assertTrue((probs[:, indices] == x['probs']).all())
            self.assertTrue((confs[indices] == x['confidence']).all())
            self.assertEqual(i, 0)
        print 'iterating with indices took %f' % (time.time() - beg)