class App(object):
  """
  Load genotype calls from a pedfile/datfile pair into the knowledge
  base, creating one GenotypeDataSample (plus its raw data object)
  per input record.
  """

  def __init__(self, host, user, passwd, study_label, maker, model, release):
    """
    Connect to the omero-driven knowledge base and resolve everything
    the loader needs: the SNPMarkersSet identified by (maker, model,
    release), a per-run ActionSetup, the loader Device (created on
    first use) and the context Study.

    :raises ValueError: if the SNPMarkersSet or the Study is not
      already defined in the knowledge base.
    """
    self.kb = KB(driver='omero')(host, user, passwd)
    self.mset = self.kb.get_snp_markers_set(maker, model, release)
    self.logger = logging.getLogger()
    if not self.mset:
      raise ValueError('SNPMarkersSet[%s,%s,%s] has not been defined.'
                       % (maker, model, release))
    #-- every run gets its own ActionSetup, disambiguated by timestamp
    alabel = 'load_genotypes-setup-%s' % time.time()
    self.asetup = self.kb.factory.create(self.kb.ActionSetup,
                                         {'label': alabel,
                                          'conf': ''}).save()
    #-- the loader device is shared across runs: reuse it if present
    dmaker, dmodel, drelease = 'CRS4', 'load_genotypes', '0.1'
    dlabel = '%s-%s-%s' % (dmaker, dmodel, drelease)
    device = self.kb.get_device(dlabel)
    if not device:
      device = self.kb.factory.create(self.kb.Device,
                                      {'label': dlabel,
                                       'maker': dmaker,
                                       'model': dmodel,
                                       'release': drelease}).save()
    self.device = device
    #-- fail fast on an unknown study instead of breaking later with a
    #   None action context (resolves the old FIXME)
    self.study = self.kb.get_study(study_label)
    if not self.study:
      raise ValueError('Study[%s] has not been defined.' % study_label)

  def check_snp_markers_set(self, marker_types, marker_names):
    """
    Verify that every 'M'-typed marker name appears in the rs_label
    column of self.mset; raise ValueError on the first miss.
    """
    self.logger.info('start checking snp_markers_set')
    mdefs, _ = self.kb.get_snp_markers_set_content(self.mset)
    rs_labels = mdefs['rs_label']
    for t, n in it.izip(marker_types, marker_names):
      if t == 'M':
        if n not in rs_labels:
          msg = 'marker %s is not in the specified SNPMarkersSet' % n
          self.logger.critical(msg)
          raise ValueError(msg)
    self.logger.info('done checking snp_markers_set')

  def create_action(self, target):
    """Save and return a MEASUREMENT ActionOnVessel on target."""
    conf = {'setup': self.asetup,
            'device': self.device,
            'actionCategory': self.kb.ActionCategory.MEASUREMENT,
            'operator': 'Alfred E. Neumann',
            'context': self.study,
            'target': target,
            }
    return self.kb.factory.create(self.kb.ActionOnVessel, conf).save()

  def create_data_sample(self, action, label):
    """Save and return a USABLE GenotypeDataSample bound to self.mset."""
    conf = {'snpMarkersSet': self.mset,
            'label': label,
            'status': self.kb.DataSampleStatus.USABLE,
            'action': action}
    return self.kb.factory.create(self.kb.GenotypeDataSample, conf).save()

  def load(self, pedfile, datfile, conf_value=1.0):
    """
    Stream records from pedfile/datfile and, for each record whose
    sample label is known to the KB, store a GenotypeDataSample plus
    its probs/confs payload. Records with unknown sample labels are
    logged as errors and skipped.
    """
    pr = PedReader(pedfile, datfile, conf_value)
    self.check_snp_markers_set(pr.marker_types, pr.marker_names)
    #--
    self.logger.info('start loading from pedfile %s' % pedfile.name)
    for x in pr:
      sample = self.kb.get_vessel(x['sample_label'])
      if not sample:
        self.logger.error('No sample with label %s in VL' % x['sample_label'])
        continue
      action = self.create_action(sample)
      avid = action.id
      action.unload()  # keep only the id; the full proxy is not needed
      data_sample = self.create_data_sample(action, x['label'])
      self.kb.add_gdo_data_object(avid, data_sample, x['probs'], x['confs'])
      self.logger.info('-- loaded %s' % x['label'])
    self.logger.info('done loading from pedfile %s' % pedfile.name)
""" mset.load_markers() kb.update_snp_positions(mset.markers, ref_genome) data_sample_by_id = {} family = [] for i, ind in enumerate(kb.get_individuals(study)): family.append(ind) action = kb.create_an_action(study, target=ind, doc='fake dataset') conf = {'label' : 'taq-%03d' % i, 'status' : kb.DataSampleStatus.USABLE, 'action' : action, 'snpMarkersSet' : mset} data_sample = kb.factory.create(kb.GenotypeDataSample, conf).save() probs, conf = make_fake_data(mset) do = kb.add_gdo_data_object(action, data_sample, probs, conf) data_sample_by_id[ind.id] = data_sample """ .. Note how we first create a DataSample object (GenotypeDataSample) which basically keeps track of the fact that there exists a genotyping data set defined on a given snp markers set, and then we provide an actual data object that describes the physical object that contains the real data. The idea is that there could be many instances, data equivalent, that link to the same DataSample, e.g., on different file systems, in different formats and so on. """ """ ..
class App(object):
  """
  Command-line loader that imports genotype calls (pedfile + datfile)
  into the omero-backed knowledge base as GenotypeDataSample objects.
  """

  def __init__(self, host, user, passwd, study_label, maker, model, release):
    """
    Open the KB connection and look up the loader's dependencies.

    :raises ValueError: if no SNPMarkersSet matches (maker, model,
      release), or if study_label does not name an existing Study.
    """
    self.kb = KB(driver='omero')(host, user, passwd)
    self.mset = self.kb.get_snp_markers_set(maker, model, release)
    self.logger = logging.getLogger()
    if not self.mset:
      raise ValueError('SNPMarkersSet[%s,%s,%s] has not been defined.'
                       % (maker, model, release))
    #-- a per-run ActionSetup, labeled with the current timestamp
    alabel = 'load_genotypes-setup-%s' % time.time()
    self.asetup = self.kb.factory.create(
      self.kb.ActionSetup, {'label': alabel, 'conf': ''}).save()
    #-- the loader Device is stable across runs: create it only once
    dmaker, dmodel, drelease = 'CRS4', 'load_genotypes', '0.1'
    dlabel = '%s-%s-%s' % (dmaker, dmodel, drelease)
    device = self.kb.get_device(dlabel)
    if not device:
      device = self.kb.factory.create(
        self.kb.Device, {'label': dlabel, 'maker': dmaker,
                         'model': dmodel, 'release': drelease}).save()
    self.device = device
    #-- raise immediately for an unknown study rather than carrying a
    #   None context around (this was flagged as a FIXME)
    self.study = self.kb.get_study(study_label)
    if not self.study:
      raise ValueError('Study[%s] has not been defined.' % study_label)

  def check_snp_markers_set(self, marker_types, marker_names):
    """
    Ensure each 'M'-typed marker name is present among self.mset's
    rs_label values; raise ValueError for the first missing one.
    """
    self.logger.info('start checking snp_markers_set')
    mdefs, _ = self.kb.get_snp_markers_set_content(self.mset)
    rs_labels = mdefs['rs_label']
    for t, n in it.izip(marker_types, marker_names):
      if t == 'M':
        if n not in rs_labels:
          msg = 'marker %s is not in the specified SNPMarkersSet' % n
          self.logger.critical(msg)
          raise ValueError(msg)
    self.logger.info('done checking snp_markers_set')

  def create_action(self, target):
    """Create, save and return a MEASUREMENT ActionOnVessel on target."""
    conf = {'setup': self.asetup,
            'device': self.device,
            'actionCategory': self.kb.ActionCategory.MEASUREMENT,
            'operator': 'Alfred E. Neumann',
            'context': self.study,
            'target': target,
            }
    action = self.kb.factory.create(self.kb.ActionOnVessel, conf).save()
    return action

  def create_data_sample(self, action, label):
    """Create, save and return a USABLE GenotypeDataSample on self.mset."""
    conf = {'snpMarkersSet': self.mset,
            'label': label,
            'status': self.kb.DataSampleStatus.USABLE,
            'action': action}
    return self.kb.factory.create(self.kb.GenotypeDataSample, conf).save()

  def load(self, pedfile, datfile, conf_value=1.0):
    """
    Iterate over PedReader records, storing one GenotypeDataSample
    and its probs/confs data object per record; records whose sample
    label is unknown to the KB are logged and skipped.
    """
    pr = PedReader(pedfile, datfile, conf_value)
    self.check_snp_markers_set(pr.marker_types, pr.marker_names)
    #--
    self.logger.info('start loading from pedfile %s' % pedfile.name)
    for x in pr:
      sample = self.kb.get_vessel(x['sample_label'])
      if not sample:
        self.logger.error('No sample with label %s in VL' % x['sample_label'])
        continue
      action = self.create_action(sample)
      avid = action.id
      action.unload()  # only the id is needed past this point
      data_sample = self.create_data_sample(action, x['label'])
      self.kb.add_gdo_data_object(avid, data_sample, x['probs'], x['confs'])
      self.logger.info('-- loaded %s' % x['label'])
    self.logger.info('done loading from pedfile %s' % pedfile.name)
class markers_set(unittest.TestCase):
  """
  Integration tests for SNP markers set handling: creation, genome
  alignment, genotype data objects (GDOs), range selection and set
  intersection. All tests talk to a live omero-backed knowledge base
  reached through OME_HOST/OME_USER/OME_PASSWD.
  """

  def __init__(self, name):
    super(markers_set, self).__init__(name)
    # KB objects created during a test; tearDown deletes them in
    # reverse creation order
    self.kill_list = []

  def setUp(self):
    # fresh KB connection plus a disposable Study and Action per test
    self.kb = KB(driver='omero')(OME_HOST, OME_USER, OME_PASSWD)
    conf = {'label': 'TEST-%f' % time.time(),
            'description': 'unit test garbage',
            }
    self.study = self.kb.factory.create(self.kb.Study, conf).save()
    self.kill_list.append(self.study)
    self.action = self.kb.create_an_action(self.study)
    self.kill_list.append(self.action)

  def tearDown(self):
    # delete in reverse creation order so dependents are removed
    # before the objects they refer to
    self.kill_list.reverse()
    for x in self.kill_list:
      self.kb.delete(x)
    self.kill_list = []

  def create_markers(self, N):
    """Create N fake markers in the KB and return the resulting list."""
    def marker_generator():
      # time.time() in the label keeps markers unique across runs
      for i in range(N):
        label = 'A%f-%d' % (time.time(), i)
        yield (label, label, 'ACCA[A/B]TACCA')
    source, context, release = 'unit_testing', 'markers_set', '%f' % time.time()
    ref_rs_genome, dbsnp_build = 'foo-rs-genome', 123000
    lvs = self.kb.create_markers(source, context, release,
                                 ref_rs_genome, dbsnp_build,
                                 marker_generator(), self.action)
    return lvs

  def create_snp_markers_set(self, lvs):
    """Build a SNPMarkersSet over lvs, schedule it for deletion, return it."""
    label = 'ams-%f' % time.time()
    maker, model, release = 'FOO', 'FOO1', '%f' % time.time()
    # (marker id, index within the set, allele-flip flag) triples
    markers_selection = [(v[1], i, False) for i, v in enumerate(lvs)]
    mset = self.kb.create_snp_markers_set(label, maker, model, release,
                                          len(lvs), markers_selection,
                                          self.action)
    self.kill_list.append(mset)
    return mset

  def create_alignments(self, mset, ref_genome, n_duplicates):
    """
    Align mset's markers against ref_genome, marking the first
    n_duplicates markers as multiply aligned (n_copies == 2), and
    return the expected per-marker position; multiply aligned markers
    are expected to report position (0, 0).
    """
    mset.load_markers()
    self.assertTrue(len(mset) > 0)
    # total number of alignment records expected (currently unused)
    n_aligns = len(mset.markers) + n_duplicates
    pos = []
    def insert_duplicates(markers):
      count = 0
      for i, m in enumerate(markers):
        n_copies = 1
        if count < n_duplicates:
          # NOTE(review): the extra alignment record for a duplicated
          # marker is emitted here, before the standard one below --
          # this reading matches n_aligns = len(markers) + n_duplicates
          count += 1
          n_copies = 2
          r = (m[0], random.randint(1,26), 22 + i*1000, True,
               'A' if (i%2)== 0 else 'B', n_copies)
          yield r
        r = (m[0], random.randint(1,26), 1 + i*1000, True,
             'A' if (i%2)== 0 else 'B', n_copies)
        pos.append((0,0) if n_copies > 1 else (r[1], r[2]))
        yield r
    aligns = [x for x in insert_duplicates(mset.markers)]
    # shuffle to check that alignment storage is order-independent
    random.shuffle(aligns)
    self.kb.align_snp_markers_set(mset, ref_genome, aligns, self.action)
    return pos

  def create_data_sample(self, mset, label):
    """Create a USABLE GenotypeDataSample on mset (scheduled for deletion)."""
    conf = {'label': label,
            'status': self.kb.DataSampleStatus.USABLE,
            'action': self.action,
            'snpMarkersSet': mset,
            }
    data_sample = self.kb.factory.create(self.kb.GenotypeDataSample,
                                         conf).save()
    self.kill_list.append(data_sample)
    return data_sample

  def create_data_object(self, data_sample, add_nan=False):
    """Attach fake probs/confs data to data_sample; return the arrays."""
    probs, confs = make_fake_data(data_sample.snpMarkersSet, add_nan)
    do = self.kb.add_gdo_data_object(self.action, data_sample, probs, confs)
    self.kill_list.append(do)
    return probs, confs

  def create_aligned_mset(self, N, N_dups, ref_genome):
    """Convenience: markers + markers set + alignments in one call."""
    lvs = self.create_markers(N)
    mset = self.create_snp_markers_set(lvs)
    pos = self.create_alignments(mset, ref_genome, N_dups)
    return mset, pos

  def test_creation_destruction(self):
    # round-trip: what we stored is what we read back, in order
    N = 32
    lvs = self.create_markers(N)
    mset = self.create_snp_markers_set(lvs)
    mset.load_markers()
    self.assertEqual(len(mset), N)
    for lv, m in it.izip(lvs, mset.markers):
      self.assertEqual(lv[1], m[0])

  def test_align(self):
    # stored alignments must come back as the expected positions,
    # with duplicated markers collapsing to (0, 0)
    N = 16
    N_dups = 4
    ref_genome = 'g' + ('%f' % time.time())[-14:]
    mset, pos = self.create_aligned_mset(N, N_dups, ref_genome)
    mset.load_alignments(ref_genome)
    for p, m in it.izip(pos, mset.get_markers_iterator()):
      self.assertEqual(p, m.position)

  def test_read_ssc(self):
    # write fake data to an ssc file, read it back, compare
    N = 16
    N_dups = 4
    ref_genome = 'g' + ('%f' % time.time())[-14:]
    mset, pos = self.create_aligned_mset(N, N_dups, ref_genome)
    mset.load_markers(additional_fields=['label'])
    probs, confs = make_fake_data(mset)
    sample_id = 'ffoo-%f' % time.time()
    # NOTE(review): taking .name of an unreferenced NamedTemporaryFile
    # means the file is deleted as soon as the object is collected and
    # the path is later recreated (and leaked) by make_fake_ssc --
    # consider tempfile.mkstemp instead
    fn = tempfile.NamedTemporaryFile().name
    make_fake_ssc(mset, sample_id, probs, confs, fn)
    probs_1, confs_1 = gio.read_ssc(fn, mset)
    self.assertAlmostEqual(np.sum(np.abs(probs - probs_1)), 0.0)
    self.assertAlmostEqual(np.sum(np.abs(confs - confs_1)), 0.0)

  def test_gdo(self):
    N = 32
    lvs = self.create_markers(N)
    mset = self.create_snp_markers_set(lvs)
    mset.load_markers()
    data_sample = self.create_data_sample(mset, 'foo-data')
    probs, confs = self.create_data_object(data_sample)
    probs1, confs1 = data_sample.resolve_to_data()
    self.assertTrue((probs == probs1).all())
    self.assertTrue((confs == confs1).all())
    s = self.kb.get_gdo_iterator(mset, data_samples=[data_sample])
    for i, x in enumerate(s):
      self.assertTrue((probs == x['probs']).all())
      self.assertTrue((confs == x['confidence']).all())
    # exactly one gdo is expected for the data sample
    self.assertEqual(i, 0)
    indices = slice(N/4, N/2)
    s = self.kb.get_gdo_iterator(mset, data_samples=[data_sample],
                                 indices=indices)
    for i, x in enumerate(s):
      self.assertTrue((probs[:,indices] == x['probs']).all())
      self.assertTrue((confs[indices] == x['confidence']).all())
    self.assertEqual(i, 0)

  def test_define_range_selector(self):
    N, N_dups = 16, 0
    ref_genome = 'g' + ('%f' % time.time())[-14:]
    mset, pos = self.create_aligned_mset(N, N_dups, ref_genome)
    mset.load_alignments(ref_genome)
    pos.sort()
    # NOTE(review): low_pos/high_pos stay unbound when len(pos) <= 2;
    # safe here only because N == 16
    if len(pos) > 2:
      low_pos, high_pos = pos[1], pos[-2]
    gc_range = (low_pos, high_pos)
    range_sel = self.kb.SNPMarkersSet.define_range_selector(mset, gc_range)
    i = 0
    for (i, m) in enumerate(mset.get_markers_iterator()):
      if i in range_sel:
        self.assertTrue(low_pos <= m.position <= high_pos)
      else:
        self.assertTrue(low_pos > m.position or high_pos < m.position)

  def test_intersect(self):
    ref_genome = 'g' + ('%f' % time.time())[-14:]
    N1 = 16    # size of the first set
    M1 = 2     # leading markers of set 1 forced to a null alignment
    N2 = N1/2  # size of the second set
    M2 = 1     # leading markers of set 2 forced to a null alignment
    lvs = self.create_markers(N1)
    mset1 = self.create_snp_markers_set(lvs)
    mset1.load_markers()
    aligns = [(m[0], random.randint(1,26), 1 + i*2000, True, 'A', 1)
              for i, m in enumerate(mset1.markers)]
    for i in range(M1):
      aligns[i] = (aligns[i][0], 0, 0, True, 'A', 0)
    self.kb.align_snp_markers_set(mset1, ref_genome, aligns, self.action)
    lvs = self.create_markers(N2)
    mset2 = self.create_snp_markers_set(lvs)
    mset2.load_markers()
    # reuse the first len(mset2) alignments so the two sets overlap
    # by construction
    aligns = [(m[0], a[1], a[2], a[3], a[4], a[5])
              for m, a in it.izip(mset2.markers, aligns[:len(mset2)])]
    for i in range(M2):
      aligns[i] = (aligns[i][0], 0, 0, True, 'A', 0)
    self.kb.align_snp_markers_set(mset2, ref_genome, aligns, self.action)
    mset1.load_alignments(ref_genome)
    mset2.load_alignments(ref_genome)
    # self-intersection must be the identity over the whole set
    idx1, idx2 = self.kb.SNPMarkersSet.intersect(mset1, mset1)
    self.assertTrue(np.array_equal(idx1, idx2))
    self.assertEqual(len(idx1), len(mset1))
    self.assertEqual(len(idx1), N1)
    idx1, idx2 = self.kb.SNPMarkersSet.intersect(mset1, mset2)
    self.assertEqual(len(idx1), len(idx2))
    # null-aligned markers drop out of the intersection
    self.assertEqual(len(idx1), N2 - max(M1, M2))
    for i,j in it.izip(idx1, idx2):
      m1, m2 = mset1[i], mset2[j]
      self.assertEqual(m1.position, m2.position)
      self.assertTrue(m1.position > (0,0))

  def test_speed(self):
    # timing probe, not a correctness test: prints wall-clock times
    ref_genome = 'g' + ('%f' % time.time())[-14:]
    N1 = 1024*1024
    N2 = N1/2  # currently unused
    beg = time.time()
    lvs = self.create_markers(N1)
    print ''
    print 'creating %d markers took %f' % (N1, time.time() - beg)
    beg = time.time()
    mset1 = self.create_snp_markers_set(lvs)
    print 'creating a markers set with %d markers took %f' % (N1,
                                                              time.time() - beg)
    beg = time.time()
    mset1.load_markers()
    print 'loading markers took %f' % (time.time() - beg)
    beg = time.time()
    aligns = [(m[0], random.randint(1,26), 1 + i*2000, True, 'A', 1)
              for i, m in enumerate(mset1.markers)]
    print 'creating %d aligns took %f' % (N1, time.time() - beg)
    beg = time.time()
    self.kb.align_snp_markers_set(mset1, ref_genome, aligns, self.action)
    print 'saving %d aligns took %f' % (N1, time.time() - beg)
    beg = time.time()
    mset1.load_alignments(ref_genome)
    print 'loading %d aligns took %f' % (N1, time.time() - beg)
    beg = time.time()
    idx1, idx2 = self.kb.SNPMarkersSet.intersect(mset1, mset1)
    print 'intersecting %d aligns took %f' % (N1, time.time() - beg)

  def test_speed_gdo(self):
    # timing probe for gdo creation/retrieval; N is sized like a real
    # genotyping chip
    N = 934968
    beg = time.time()
    lvs = self.create_markers(N)
    print ''
    print 'creating %d markers took %f' % (N, time.time() - beg)
    mset = self.create_snp_markers_set(lvs)
    beg = time.time()
    mset.load_markers()
    print 'loading %d markers took %f' % (N, time.time() - beg)
    beg = time.time()
    data_sample = self.create_data_sample(mset, 'foo-data')
    print 'creating a data sample took %f' % (time.time() - beg)
    beg = time.time()
    probs, confs = self.create_data_object(data_sample)
    print 'creating a data object took %f' % (time.time() - beg)
    beg = time.time()
    probs1, confs1 = data_sample.resolve_to_data()
    print 'resolving to data took %f' % (time.time() - beg)
    self.assertTrue((probs == probs1).all())
    self.assertTrue((confs == confs1).all())
    beg = time.time()
    s = self.kb.get_gdo_iterator(mset, data_samples=[data_sample])
    for i, x in enumerate(s):
      self.assertTrue((probs == x['probs']).all())
      self.assertTrue((confs == x['confidence']).all())
    self.assertEqual(i, 0)
    print 'iterating took %f' % (time.time() - beg)
    indices = slice(N/4, N/2)
    beg = time.time()
    s = self.kb.get_gdo_iterator(mset, data_samples=[data_sample],
                                 indices=indices)
    for i, x in enumerate(s):
      self.assertTrue((probs[:,indices] == x['probs']).all())
      self.assertTrue((confs[indices] == x['confidence']).all())
    self.assertEqual(i, 0)
    print 'iterating with indices took %f' % (time.time() - beg)
mset.load_markers() kb.update_snp_positions(mset.markers, ref_genome) data_sample_by_id = {} family = [] for i, ind in enumerate(kb.get_individuals(study)): family.append(ind) action = kb.create_an_action(study, target=ind, doc='fake dataset') conf = { 'label': 'taq-%03d' % i, 'status': kb.DataSampleStatus.USABLE, 'action': action, 'snpMarkersSet': mset } data_sample = kb.factory.create(kb.GenotypeDataSample, conf).save() probs, conf = make_fake_data(mset) do = kb.add_gdo_data_object(action, data_sample, probs, conf) data_sample_by_id[ind.id] = data_sample """ .. Note how we first create a DataSample object (GenotypeDataSample), which records the existence of a genotyping data set defined on a given SNP markers set, and then we provide an actual data object that describes the physical object containing the real data. The idea is that there can be many data-equivalent instances linked to the same DataSample, e.g., on different file systems, in different formats and so on. """ """ .. As an example, we will now write out the information we have just saved as a PLINK pedfile.