def record(self, records, otsv, rtsv, blocking_validation):
    def records_by_chunk(batch_size, records):
        offset = 0
        while len(records[offset:]) > 0:
            yield records[offset:offset + batch_size]
            offset += batch_size
    if len(records) == 0:
        msg = 'No records are going to be imported'
        self.logger.critical(msg)
        raise core.ImporterValidationError(msg)
    study = self.find_study(records)
    self.source_klass = self.find_source_klass(records)
    self.device_klass = self.find_device_klass(records)
    self.preload_scanners()
    self.preload_devices()
    self.preload_sources()
    self.preload_markers_sets()
    self.preload_data_samples()
    records, bad_records = self.do_consistency_checks(records)
    for br in bad_records:
        rtsv.writerow(br)
    if blocking_validation and len(bad_records) >= 1:
        raise core.ImporterValidationError('%d invalid records'
                                           % len(bad_records))
    for i, c in enumerate(records_by_chunk(self.batch_size, records)):
        self.logger.info('start processing chunk %d' % i)
        self.process_chunk(otsv, c, study)
        self.logger.info('done processing chunk %d' % i)
def record(self, records, otsv, rtsv, blocking_validation):
    def records_by_chunk(batch_size, records):
        offset = 0
        while len(records[offset:]) > 0:
            yield records[offset:offset + batch_size]
            offset += batch_size
    if not records:
        msg = 'No records are going to be imported'
        self.logger.critical(msg)
        raise core.ImporterValidationError(msg)
    self.container_klass = self.find_container_klass(records)
    self.preload_containers()
    if self.container_klass == self.kb.Lane:
        self.preload_flowcells()
    records, bad_records = self.do_consistency_checks(records)
    for br in bad_records:
        rtsv.writerow(br)
    if blocking_validation and len(bad_records) >= 1:
        raise core.ImporterValidationError('%d invalid records'
                                           % len(bad_records))
    study = self.find_study(records)
    device = self.get_device(label='importer-%s.titer_plate' % version,
                             maker='CRS4', model='importer', release=version)
    act_setups = set(Recorder.get_action_setup_options(r, self.action_setup_conf)
                     for r in records)
    self.logger.debug('Action setups:\n%r' % act_setups)
    actions = {}
    for acts in act_setups:
        acts_label = 'import-prog-%f' % time.time()
        act_setup_conf = {'label': acts_label, 'conf': acts}
        act_setup = self.kb.save(
            self.kb.factory.create(self.kb.ActionSetup, act_setup_conf))
        acat = self.kb.ActionCategory.IMPORT
        act_conf = {
            'setup': act_setup,
            'device': device,
            'actionCategory': acat,
            'operator': self.operator,
            'context': study,
        }
        act = self.kb.save(self.kb.factory.create(self.kb.Action, act_conf))
        act.unload()
        actions[acts] = act
    for i, c in enumerate(records_by_chunk(self.batch_size, records)):
        self.logger.info('start processing chunk %d' % i)
        self.process_chunk(otsv, c, study, actions)
        self.logger.info('done processing chunk %d' % i)
def implementation(logger, host, user, passwd, args):
    action_setup_conf = Recorder.find_action_setup_conf(args)
    f = csv.DictReader(args.ifile, delimiter='\t')
    records = [r for r in f]
    if len(records) == 0:
        msg = 'No records are going to be imported'
        logger.critical(msg)
        raise core.ImporterValidationError(msg)
    canonizer = core.RecordCanonizer(['study'], args)
    canonizer.canonize_list(records)
    study_label = records[0]['study']
    o = csv.DictWriter(args.ofile,
                       fieldnames=['study', 'label', 'type', 'vid'],
                       delimiter='\t', lineterminator=os.linesep)
    recorder = Recorder(o, study_label, host, user, passwd,
                        args.keep_tokens, args.batch_size,
                        operator=args.operator,
                        action_setup_conf=action_setup_conf, logger=logger)
    report_fnames = copy.deepcopy(f.fieldnames)
    report_fnames.append('error')
    report = csv.DictWriter(args.report_file, report_fnames,
                            delimiter='\t', lineterminator=os.linesep,
                            extrasaction='ignore')
    report.writeheader()
    records, bad_records = recorder.do_consistency_checks(records)
    for br in bad_records:
        report.writerow(br)
    if args.blocking_validator and len(bad_records) >= 1:
        args.ofile.close()
        args.ifile.close()
        args.report_file.close()
        msg = '%d invalid records' % len(bad_records)
        recorder.logger.critical(msg)
        raise core.ImporterValidationError(msg)
    by_label = make_ind_by_label(records)
    import_pedigree(recorder, by_label.itervalues())
    recorder.clean_up()
    args.ofile.close()
    args.ifile.close()
    args.report_file.close()
def record(self, records, rtsv):
    def records_by_chunk(batch_size, records):
        offset = 0
        while len(records[offset:]) > 0:
            yield records[offset:offset + batch_size]
            offset += batch_size
    if not records:
        msg = 'No records are going to be imported'
        self.logger.critical(msg)
        raise core.ImporterValidationError(msg)
    self.preload_individuals()
    self.preload_birth_data_records()
    self.preload_locations()
    records, bad_records = self.do_consistency_checks(records)
    for br in bad_records:
        rtsv.writerow(br)
    study = self.find_study(records)
    device_label = 'importer.birth_data-%s' % version
    device = self.get_device(label=device_label, maker='CRS4',
                             model='importer', release=version)
    asetup = self.get_action_setup('importer.birth_data-%f' % time.time(),
                                   json.dumps(self.action_setup_conf))
    for i, c in enumerate(records_by_chunk(self.batch_size, records)):
        self.logger.info('start processing chunk %d' % i)
        self.process_chunk(c, study, asetup, device)
        self.logger.info('done processing chunk %d' % i)
def record(self, records, otsv):
    def records_by_chunk(batch_size, records):
        offset = 0
        while len(records[offset:]) > 0:
            yield records[offset:offset + batch_size]
            offset += batch_size
    if len(records) == 0:
        msg = 'No records are going to be imported'
        self.logger.critical(msg)
        raise core.ImporterValidationError(msg)
    self.preload_groups()
    self.preload_individuals()
    def keyfunc(r):
        return r['group']
    sub_records = []
    records = sorted(records, key=keyfunc)
    for k, g in it.groupby(records, keyfunc):
        sub_records.append(self.do_consistency_checks(k, list(g)))
    records = sum(sub_records, [])
    records = sorted(records, key=keyfunc)
    for k, g in it.groupby(records, keyfunc):
        group_conf = {'label': k}
        group = self.kb.factory.create(self.kb.Study, group_conf).save()
        for i, c in enumerate(records_by_chunk(self.batch_size, list(g))):
            self.logger.info('start processing chunk %s-%d' % (k, i))
            self.process_chunk(otsv, group, c)
            self.logger.info('done processing chunk %s-%d' % (k, i))
def record(self, records, otsv, rtsv, blocking_validation):
    def records_by_chunk(batch_size, records):
        offset = 0
        while len(records[offset:]) > 0:
            yield records[offset:offset + batch_size]
            offset += batch_size
    if not records:
        msg = 'No records are going to be imported'
        self.logger.critical(msg)
        raise core.ImporterValidationError(msg)
    study = self.find_study(records)
    self.source_klass = self.find_source_klass(records)
    self.vessel_klass = self.find_vessel_klass(records)
    self.preload_sources()
    if self.vessel_klass == self.kb.PlateWell:
        self.preload_plates()
    records, bad_records = self.do_consistency_checks(records)
    for br in bad_records:
        rtsv.writerow(br)
    if blocking_validation and len(bad_records) >= 1:
        raise core.ImporterValidationError('%d invalid records'
                                           % len(bad_records))
    device = self.get_device('importer-%s.biosample' % version,
                             'CRS4', 'IMPORT', version)
    act_setups = set(Recorder.get_action_setup_options(r, self.action_setup_conf)
                     for r in records)
    asetup = {}
    for acts in act_setups:
        setup_conf = {'label': 'import-prog-%f' % time.time(), 'conf': acts}
        setup = self.kb.factory.create(self.kb.ActionSetup, setup_conf)
        asetup[acts] = self.kb.save(setup)
    for i, c in enumerate(records_by_chunk(self.batch_size, records)):
        self.logger.info('start processing chunk %d' % i)
        self.process_chunk(otsv, c, study, asetup, device)
        self.logger.info('done processing chunk %d' % i)
def record(self, records, blocking_validation):
    def records_by_chunk(batch_size, records):
        offset = 0
        while len(records[offset:]) > 0:
            yield records[offset:offset + batch_size]
            offset += batch_size
    if not records:
        msg = 'No records are going to be imported'
        self.logger.critical(msg)
        raise core.ImporterValidationError(msg)
    self.preload_studies()
    records, bad_records = self.do_consistency_checks(records)
    for br in bad_records:
        self.report_stream.writerow(br)
    if blocking_validation and len(bad_records) >= 1:
        raise core.ImporterValidationError('%d invalid records'
                                           % len(bad_records))
    for i, c in enumerate(records_by_chunk(self.batch_size, records)):
        self.logger.info('start processing chunk %d' % i)
        self.process_chunk(c)
        self.logger.info('done processing chunk %d' % i)
def record(self, records, otsv, rtsv, blocking_validation):
    def records_by_chunk(batch_size, records):
        offset = 0
        while len(records[offset:]) > 0:
            yield records[offset:offset + batch_size]
            offset += batch_size
    def get_data_collection(label, action):
        if label in self.preloaded_data_collections:
            return self.preloaded_data_collections[label]
        else:
            dc_conf = {'label': label, 'action': action}
            return self.kb.factory.create(self.kb.DataCollection, dc_conf)
    if len(records) == 0:
        msg = 'No records are going to be imported'
        self.logger.critical(msg)
        raise core.ImporterValidationError(msg)
    study = self.find_study(records)
    self.data_sample_klass = self.find_data_sample_klass(records)
    self.preload_data_samples()
    self.preload_data_collections()
    asetup = self.get_action_setup(
        'importer.data_collection-%f' % time.time(),
        json.dumps(self.action_setup_conf))
    device = self.get_device('importer-%s.data_collection' % version,
                             'CRS4', 'IMPORT', version)
    conf = {
        'setup': asetup,
        'device': device,
        'actionCategory': self.kb.ActionCategory.PROCESSING,
        'operator': self.operator,
        'context': study,
    }
    action = self.kb.factory.create(self.kb.Action, conf).save()
    def keyfunc(r):
        return r['label']
    sub_records = []
    data_collections = {}
    records = sorted(records, key=keyfunc)
    for k, g in it.groupby(records, keyfunc):
        data_collections[k] = get_data_collection(k, action)
        good_records, bad_records = self.do_consistency_checks(
            data_collections[k], list(g))
        sub_records.append(good_records)
        for br in bad_records:
            rtsv.writerow(br)
        if blocking_validation and len(bad_records) >= 1:
            self.kb.delete(action)
            raise core.ImporterValidationError('%d invalid records'
                                               % len(bad_records))
    records = sum(sub_records, [])
    if len(records) == 0:
        self.kb.delete(action)
        msg = 'No records are going to be imported'
        self.logger.warning(msg)
        sys.exit(0)
    records = sorted(records, key=keyfunc)
    for k, g in it.groupby(records, keyfunc):
        dc = data_collections[k]
        if not dc.is_mapped():
            dc.save()
        for i, c in enumerate(records_by_chunk(self.batch_size, list(g))):
            self.logger.info('start processing chunk %s-%d' % (k, i))
            self.process_chunk(otsv, study, dc, c)
            self.logger.info('done processing chunk %s-%d' % (k, i))
def record(self, records, otsv, rtsv, blocking_validation):
    def records_by_chunk(batch_size, records):
        offset = 0
        while len(records[offset:]) > 0:
            yield records[offset:offset + batch_size]
            offset += batch_size
    if len(records) == 0:
        msg = 'No records are going to be imported'
        self.logger.critical(msg)
        raise core.ImporterValidationError(msg)
    study = self.find_study(records)
    self.source_klass = self.find_source_klass(records)
    self.seq_sample_klass = self.find_seq_sample_klass(records)
    self.preload_sources()
    self.preload_devices()
    if self.seq_sample_klass == self.kb.RawSeqDataSample:
        self.preload_lanes()
    if self.seq_sample_klass == self.kb.SeqDataSample:
        self.preload_tubes()
    records, bad_records = self.do_consistency_checks(records)
    for br in bad_records:
        rtsv.writerow(br)
    if blocking_validation and len(bad_records) >= 1:
        raise core.ImporterValidationError('%d invalid records'
                                           % len(bad_records))
    act_setups = set((r['source'], r.get('device', None),
                      Recorder.get_action_setup_options(r, self.action_setup_conf,
                                                        self.history))
                     for r in records)
    self.logger.debug('Action setups:\n%r' % act_setups)
    actions = {}
    for acts in act_setups:
        # TODO: if a history has been passed, add this to the options
        act_label = 'importer.seq_data_sample.%f' % time.time()
        act_setup_conf = {'label': act_label, 'conf': acts[2]}
        act_setup = self.kb.save(
            self.kb.factory.create(self.kb.ActionSetup, act_setup_conf))
        if issubclass(self.source_klass, self.kb.FlowCell):
            act_klass = self.kb.ActionOnCollection
            act_category = self.kb.ActionCategory.MEASUREMENT
        elif issubclass(self.source_klass, self.kb.DataSample):
            act_klass = self.kb.ActionOnDataSample
            act_category = self.kb.ActionCategory.PROCESSING
        else:
            self.logger.error('Unmanaged source type %r' % self.source_klass)
            sys.exit('Unmanaged source type %r' % self.source_klass)
        act_conf = {
            'setup': act_setup,
            'actionCategory': act_category,
            'operator': self.operator,
            'context': study,
            'target': self.preloaded_sources[acts[0]],
        }
        if acts[1]:
            act_conf['device'] = self.preloaded_devices[acts[1]]
        action = self.kb.factory.create(act_klass, act_conf)
        action = self.kb.save(action)
        # Unload the action object, or it will cause a bug when saving
        # objects that reference ActionOnDataSample records (too many
        # inheritance steps).
        action.unload()
        actions[acts] = action
    self.logger.debug('Actions are:\n%r' % actions)
    for i, c in enumerate(records_by_chunk(self.batch_size, records)):
        self.logger.info('start processing chunk %d' % i)
        self.process_chunk(otsv, c, actions, study)
        self.logger.info('done processing chunk %d' % i)
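# Every record() method above re-defines the same records_by_chunk generator
# inline. A minimal sketch of a shared module-level helper that these methods
# could reuse instead; the name and placement are an assumption, not part of
# the original importers:
def records_by_chunk(batch_size, records):
    """Yield successive batch_size-sized slices of records."""
    offset = 0
    while offset < len(records):
        yield records[offset:offset + batch_size]
        offset += batch_size

# Example usage (hypothetical values):
#   for i, chunk in enumerate(records_by_chunk(500, records)):
#       process(chunk)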