def dump_gender_check(self, ofile):
    """Write one TSV row per DataObject, tagged with its individual's gender.

    For every data sample returned by ``self.pre_load_data()`` the connected
    individual is looked up through a ``DependencyTree`` built on ``self.kb``.
    Each DataObject registered for that data sample produces a row with the
    columns ``individual_id``, ``gender``, ``path``, ``mimetype``, ``size``
    and ``sha1``.  Data samples with no DataObject entry are reported with a
    warning and produce no rows.

    :param ofile: writable file-like object that receives the TSV output.
    :raises IndexError: if a data sample has no connected object whose type
        is exactly ``self.kb.Individual``.
    """
    dt = DependencyTree(self.kb)
    data_samples, ds_to_do = self.pre_load_data()
    fnames = ['individual_id', 'gender', 'path', 'mimetype', 'size', 'sha1']
    tsv = csv.DictWriter(ofile, fnames,
                         delimiter='\t', lineterminator=os.linesep)
    tsv.writeheader()
    for ds in data_samples:
        connected = dt.get_connected(ds)
        # Exact type match (not isinstance) is kept on purpose so subclasses
        # are still excluded.  A list comprehension is used instead of
        # indexing filter(), which is an iterator on Python 3.
        individuals = [x for x in connected
                       if type(x) == self.kb.Individual]
        i = individuals[0]
        # `in` replaces dict.has_key(), which no longer exists on Python 3.
        if ds.id in ds_to_do:
            for do in ds_to_do[ds.id]:
                tsv.writerow({
                    'individual_id': i.id,
                    'gender': self.gender_str(i.gender),
                    'path': do.path,
                    'mimetype': do.mimetype,
                    'size': do.size,
                    'sha1': do.sha1,
                })
        else:
            # Logger.warn is a deprecated alias of Logger.warning.
            self.logger.warning('there is no DataObject for %s[%s]' %
                                (ds.label, ds.id))
def dump(self, study_label, ofile):
    """Dump the individuals of one study, or of the whole KB.

    When ``study_label`` is ``None`` every ``Individual`` object known to the
    KB is dumped and the label ``"_ALL_"`` is used instead.  Otherwise only
    the individuals enrolled in the named study are dumped.  The actual
    writing is delegated to ``self.dump_individuals``.

    :param study_label: label of the study to dump, or ``None`` for all.
    :param ofile: output file object, passed on to ``dump_individuals``.
    :raises ValueError: if ``study_label`` is given but no such study exists
        (the same message is logged at critical level first).
    """
    kb = self.kb
    if study_label is not None:
        study = kb.get_study(study_label)
        if not study:
            msg = 'study %s does not exist' % study_label
            self.logger.critical(msg)
            raise ValueError(msg)
        individuals = [enrollment.individual
                       for enrollment in kb.get_enrolled(study)]
    else:
        individuals = kb.get_objects(kb.Individual)
        study_label = "_ALL_"
    tree = DependencyTree(kb)
    self.dump_individuals(tree, study_label, individuals, ofile)
def dump(self, args):
    """Execute a user-supplied script against a small set of KB helpers.

    The script read from ``args.code_file`` is compiled and executed with
    this method's local namespace as its globals, so it can use:

    * ``writeheader(*field_names)`` -- start a tab-separated table on
      ``args.ofile`` and write its header row;
    * ``writerow(*field_values)`` -- write one row, pairing the values
      positionally with the field names given to ``writeheader``;
    * ``Individuals(group)`` -- ``self.kb.get_individuals(group)``;
    * ``DataSamples(individual, data_sample_klass_name='DataSample')`` --
      objects of the named KB class connected to ``individual``;
    * ``DataObjects(data_sample)`` -- DataObject records whose sample is
      ``data_sample``;
    * ``enum_label(x)`` -- ``'MALE'``/``'FEMALE'`` for the matching
      ``self.kb.Gender`` values, ``None`` for anything else;
    * ``group`` -- the study returned by ``self.kb.get_study(args.group)``.

    Every other local (``self``, ``args``, ``dt``, ...) is visible to the
    script as well.

    :param args: object providing ``ofile``, ``code_file`` and ``group``.
    """
    self.ots = None
    self._field_names = []
    self.logger.info('start loading dependency tree')
    dt = DependencyTree(self.kb)
    self.logger.info('done loading dependency tree')

    def writeheader(*field_names):
        self._field_names = field_names
        self.ots = csv.DictWriter(args.ofile, self._field_names,
                                  delimiter='\t')
        self.ots.writeheader()

    def writerow(*field_values):
        # Values are matched to columns by position; zip() stops at the
        # shorter of the two sequences.
        self.ots.writerow(dict(zip(self._field_names, field_values)))

    def Individuals(group):
        return self.kb.get_individuals(group)

    def DataSamples(individual, data_sample_klass_name='DataSample'):
        klass = getattr(self.kb, data_sample_klass_name)
        return dt.get_connected(individual, aklass=klass)

    def DataObjects(data_sample):
        q = ("select o from DataObject as o join fetch o.sample as s "
             "where s.id = :sid")
        return self.kb.find_all_by_query(q, {'sid': data_sample.omero_id})

    def enum_label(x):
        if isinstance(x, self.kb.Gender):
            if x == self.kb.Gender.MALE:
                return 'MALE'
            if x == self.kb.Gender.FEMALE:
                return 'FEMALE'
        return None

    code = args.code_file.read()
    group = self.kb.get_study(args.group)
    ccode = compile(code, '<string>', 'exec')
    # SECURITY: this runs arbitrary Python taken from args.code_file with
    # full access to self and the KB.  Only use it with trusted scripts.
    # The call form of exec is valid on Python 3 and on Python 2.7 (2.7.9+
    # when, as here, the function defines closures), unlike the
    # statement form `exec ccode in locals()`.
    exec(ccode, locals())
def dump_call_gt(self, ofile):
    """Write one TSV row per DataObject of the current data collection.

    Each data sample returned by ``self.pre_load_data()`` must be connected
    to exactly one ``Individual`` in the dependency tree.  Every DataObject
    registered for the data sample yields a row with the columns ``dc_id``,
    ``item_id``, ``data_sample_label``, ``path``, ``gender``, ``mimetype``,
    ``size`` and ``sha1``.  Data samples with no DataObject entry are
    reported with a warning and produce no rows.

    :param ofile: writable file-like object that receives the TSV output.
    :raises ValueError: if ``self.data_collection`` is not set (falsy).
    :raises AssertionError: if a data sample is not connected to exactly
        one individual.
    """
    if not self.data_collection:
        raise ValueError('data_collection %s is not known to KB' %
                         self.data_collection)
    dt = DependencyTree(self.kb)
    data_samples, ds_to_do = self.pre_load_data()
    fnames = ['dc_id', 'item_id', 'data_sample_label', 'path',
              'gender', 'mimetype', 'size', 'sha1']
    tsv = csv.DictWriter(ofile, fnames,
                         delimiter='\t', lineterminator=os.linesep)
    tsv.writeheader()
    dc_id = self.data_collection.id
    for ds in data_samples:
        v = dt.get_connected(ds, self.kb.Individual)
        # Same check as before, now with a message that identifies the
        # offending data sample when it fails.
        assert len(v) == 1, (
            'expected exactly one individual for %s[%s], found %d' %
            (ds.label, ds.id, len(v)))
        i = v[0]
        # `in` replaces dict.has_key(), which no longer exists on Python 3.
        if ds.id in ds_to_do:
            for do in ds_to_do[ds.id]:
                tsv.writerow({
                    'dc_id': dc_id,
                    'data_sample_label': do.sample.label,
                    'item_id': do.sample.id,
                    'gender': self.gender_str(i.gender),
                    'path': do.path,
                    'mimetype': do.mimetype,
                    'size': do.size,
                    'sha1': do.sha1,
                })
        else:
            # Logger.warn is a deprecated alias of Logger.warning.
            self.logger.warning('there is no DataObject for %s[%s]' %
                                (ds.label, ds.id))
def dump(self, args, ots):
    """Randomly select a case/control group and write it to ``ots``.

    Individuals (all of them, or those enrolled in ``args.study``) are
    sorted into four pools -- male/female crossed with control/affected --
    according to their EHR records.  Anyone whose gender is not
    ``self.kb.Gender.MALE`` goes into the female pools.  When
    ``args.required_datasample`` is set, individuals with no connected
    object of that KB class are skipped.

    ``args.total_number`` individuals are then drawn in total:
    ``args.control_fraction`` of them controls, the rest affected, and
    within each of those ``args.male_fraction`` males.  Each count is
    rounded to the nearest integer and its complement is obtained by
    subtraction, so the four counts always add up to ``total_number``
    (independent truncation could previously lose individuals).

    One row with the keys ``group``, ``group_code`` and ``individual`` is
    written per selected individual.

    :param args: object providing ``seed``, ``study``,
        ``required_datasample``, ``reference_disease``, ``total_number``,
        ``control_fraction``, ``male_fraction`` and ``group_label``.
    :param ots: object with a ``writerow(dict)`` method.

    NOTE(review): ``self.__critical`` is assumed to abort (raise or exit);
    it is defined outside this block -- confirm.
    """
    # random.seed(None) seeds from the system source, exactly like a bare
    # random.seed() call, so no branch on args.seed is needed.
    random.seed(args.seed)

    if args.study is None:
        individuals = self.kb.get_objects(self.kb.Individual)
    else:
        study = self.kb.get_study(args.study)
        if not study:
            self.__critical('study %s does not exist' % args.study)
        individuals = [e.individual for e in self.kb.get_enrolled(study)]

    if args.required_datasample:
        tech = getattr(self.kb, args.required_datasample)
        self.logger.info('start loading dependency tree')
        dt = DependencyTree(self.kb)
        self.logger.info('done loading dependency tree')
    else:
        tech = None
        dt = None

    atype_control = 'openEHR-EHR-EVALUATION.exclusion-problem_diagnosis.v1'
    atype_diagnosis = 'openEHR-EHR-EVALUATION.problem-diagnosis.v1'
    field = 'at0002.1'

    male_controls = []
    male_affected = []
    female_controls = []
    female_affected = []
    self.logger.debug('reference_disease: %s' % args.reference_disease)
    for i in individuals:
        if tech and not dt.get_connected(i, tech):
            continue
        ehr = self.kb.get_ehr(i)
        self.logger.debug('ehr: %s' % ehr.recs)
        if ehr.matches(atype_control):
            if i.gender == self.kb.Gender.MALE:
                male_controls.append(i)
            else:
                female_controls.append(i)
        elif ehr.matches(atype_diagnosis, field, args.reference_disease):
            if i.gender == self.kb.Gender.MALE:
                male_affected.append(i)
            else:
                female_affected.append(i)

    def split(total, fraction):
        # Round half up (identical on Python 2 and 3) and give the
        # remainder to the complement so both parts sum to `total`.
        first = int(total * fraction + 0.5)
        return first, total - first

    total_number = int(args.total_number)
    total_controls, total_affected = split(total_number,
                                           args.control_fraction)
    total_male_controls, total_female_controls = split(total_controls,
                                                       args.male_fraction)
    total_male_affected, total_female_affected = split(total_affected,
                                                       args.male_fraction)

    requests = (
        ('affected males', total_male_affected, male_affected),
        ('affected females', total_female_affected, female_affected),
        ('controls males', total_male_controls, male_controls),
        ('controls females', total_female_controls, female_controls),
    )
    for label, wanted, pool in requests:
        if wanted > len(pool):
            self.__critical('Requested %d %s out of %d' %
                            (wanted, label, len(pool)))

    # The sampling order is part of the behaviour: it fixes how the seeded
    # random stream is consumed and the order of the output rows.
    selections = (
        ('male controls', male_controls, total_male_controls),
        ('female controls', female_controls, total_female_controls),
        ('male affected', male_affected, total_male_affected),
        ('female affected', female_affected, total_female_affected),
    )
    all_individuals = []
    for label, pool, wanted in selections:
        chosen = random.sample(pool, wanted)
        self.logger.info('selected %d %s out of %d' %
                         (len(chosen), label, len(pool)))
        all_individuals.extend(chosen)

    for i, indy in enumerate(all_individuals):
        ots.writerow({
            'group': args.group_label,
            'group_code': 'I%04d' % i,
            'individual': indy.id,
        })
def update_dependency_tree(self):
    """Build a new ``DependencyTree`` from this object and keep it in ``self.dt``."""
    tree = DependencyTree(self)
    self.dt = tree