def preprocess_from_files(self, shot_files, use_shots):
    """Preprocess a sublist of shots in parallel and collect the kept ones.

    Loads every shot referenced by ``shot_files``, picks a sublist with
    ``ShotList.random_sublist(use_shots)``, and runs
    ``self.preprocess_single_file`` on each picked shot in a
    ``multiprocessing`` pool. Each result is offered to
    ``used_shots.append_if_valid``; summary statistics are printed at the end.

    Args:
        shot_files: shot-list file objects handed to
            ``ShotList.load_from_shot_list_files_objects``.
        use_shots: argument forwarded to ``ShotList.random_sublist``.

    Returns:
        ShotList holding the shots accepted by ``append_if_valid``.

    Raises:
        Whatever the workers or ``append_if_valid`` raise; the pool is
        terminated before the exception propagates.
    """
    all_signals = self.conf['paths']['all_signals']

    # all shots, including invalid ones
    shot_list = ShotList()
    shot_list.load_from_shot_list_files_objects(shot_files, all_signals)
    shot_list_picked = shot_list.random_sublist(use_shots)
    num_picked = len(shot_list_picked)

    # starts empty; filled through append_if_valid below
    used_shots = ShotList()

    # leave two cores free, but always use at least one process
    use_cores = max(1, mp.cpu_count() - 2)
    pool = mp.Pool(use_cores)
    print('running in parallel on {} processes'.format(use_cores))
    start_time = time.time()
    try:
        # count from 1 so the progress display ends at N/N
        for (i, shot) in enumerate(
                pool.imap_unordered(self.preprocess_single_file,
                                    shot_list_picked), 1):
            sys.stdout.write('\r{}/{}'.format(i, num_picked))
            # no newline is written, so flush to make progress visible
            sys.stdout.flush()
            used_shots.append_if_valid(shot)
    except BaseException:
        # do not leave worker processes running after a failure
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()

    # leading newline moves off the '\r' progress line
    print('\nFinished Preprocessing {} files in {} seconds'.format(
        num_picked, time.time() - start_time))
    print('Omitted {} shots of {} total.'.format(
        num_picked - len(used_shots), num_picked))
    print('{}/{} disruptive shots'.format(
        used_shots.num_disruptive(), len(used_shots)))
    if len(used_shots) == 0:
        print("WARNING: All shots were omitted, please ensure raw data is "
              "complete and available at {}.".format(
                  self.conf['paths']['signal_prepath']))
    return used_shots
def train_on_files(self, shot_files, use_shots, all_machines, verbose=False):
    """Compute (or load) normalization statistics from a sublist of shots.

    Statistics are recomputed when none were previously saved, when some
    machine in ``all_machines`` is missing from the saved ones, or when
    ``conf['data']['recompute_normalization']`` is set. Otherwise the saved
    statistics are loaded.

    Args:
        shot_files: shot-list file objects handed to
            ``ShotList.load_from_shot_list_files_objects``.
        use_shots: argument forwarded to ``ShotList.random_sublist``.
        all_machines: collection of machines to normalize for; it must
            support ``-`` against the saved machines and ``len``
            (presumably a set -- confirm against callers).
        verbose: forwarded to ``load_stats`` on the load-only path; when
            true, ``g.print_unique(self)`` is called at the end.

    Raises:
        Whatever the workers or ``incorporate_stats`` raise; the pool is
        terminated before the exception propagates.
    """
    conf = self.conf
    all_signals = conf['paths']['all_signals']
    shot_list = ShotList()
    shot_list.load_from_shot_list_files_objects(shot_files, all_signals)
    shot_list_picked = shot_list.random_sublist(use_shots)
    num_picked = len(shot_list_picked)

    previously_saved, machines_saved = self.previously_saved_stats()
    machines_to_compute = all_machines - machines_saved
    if conf['data']['recompute_normalization']:
        # forced recompute: ignore anything saved earlier
        machines_to_compute = all_machines
        previously_saved = False

    if not previously_saved or len(machines_to_compute) > 0:
        if previously_saved:
            # keep existing stats and only add the missing machines
            self.load_stats(verbose=True)
        print('computing normalization for machines {}'.format(
            machines_to_compute))
        # leave two cores free, but always use at least one process
        use_cores = max(1, mp.cpu_count() - 2)
        pool = mp.Pool(use_cores)
        print('running in parallel on {} processes'.format(use_cores))
        start_time = time.time()
        try:
            # count from 1 so the progress display ends at N/N
            for (i, stats) in enumerate(
                    pool.imap_unordered(self.train_on_single_shot,
                                        shot_list_picked), 1):
                if stats.machine in machines_to_compute:
                    self.incorporate_stats(stats)
                    self.machines.add(stats.machine)
                sys.stdout.write('\r' + '{}/{}'.format(i, num_picked))
                # no newline is written, so flush to make progress visible
                sys.stdout.flush()
        except BaseException:
            # do not leave worker processes running after a failure
            pool.terminate()
            raise
        else:
            pool.close()
        finally:
            pool.join()
        # single string: two print arguments produced a doubled space
        print('\nFinished Training Normalizer on '
              '{} files in {} seconds'.format(
                  num_picked, time.time() - start_time))
        self.save_stats(verbose=True)
    else:
        self.load_stats(verbose=verbose)
    # print representation of trained Normalizer to stdout:
    # Machine, NormalizerName, per-signal normalization stats/params
    if verbose:
        g.print_unique(self)
def preprocess_from_files(self, shot_files, use_shots):
    """Preprocess a sublist of shots in parallel and collect the kept ones.

    Loads every shot referenced by ``shot_files``, picks a sublist with
    ``ShotList.random_sublist(use_shots)``, and runs
    ``self.preprocess_single_file`` on each picked shot in a
    ``multiprocessing`` pool. Each result is offered to
    ``used_shots.append_if_valid``; counts of used, omitted, and disruptive
    shots are printed at the end.

    Args:
        shot_files: shot-list file objects handed to
            ``ShotList.load_from_shot_list_files_objects``.
        use_shots: argument forwarded to ``ShotList.random_sublist``.

    Returns:
        ShotList holding the shots accepted by ``append_if_valid``.

    Raises:
        Whatever the workers or ``append_if_valid`` raise; the pool is
        terminated before the exception propagates.
    """
    all_signals = self.conf['paths']['all_signals']

    # all shots, including invalid ones
    shot_list = ShotList()
    shot_list.load_from_shot_list_files_objects(shot_files, all_signals)
    shot_list_picked = shot_list.random_sublist(use_shots)
    num_picked = len(shot_list_picked)

    # starts empty; filled through append_if_valid below
    used_shots = ShotList()

    # TODO(KGF): generalize the following line to perform well on
    # architectures other than CPUs, e.g. KNLs
    # min( <desired-maximum-process-count>, max(1,mp.cpu_count()-2) )
    use_cores = max(1, mp.cpu_count() - 2)
    pool = mp.Pool(use_cores)
    print('Running in parallel on {} processes'.format(use_cores))
    start_time = time.time()
    try:
        # count from 1 so the progress display ends at N/N
        for (i, shot) in enumerate(
                pool.imap_unordered(self.preprocess_single_file,
                                    shot_list_picked), 1):
            sys.stdout.write('\r{}/{}'.format(i, num_picked))
            # no newline is written, so flush to make progress visible
            sys.stdout.flush()
            used_shots.append_if_valid(shot)
    except BaseException:
        # do not leave worker processes running after a failure
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()

    num_used = len(used_shots)
    disruptive_used = used_shots.num_disruptive()
    disruptive_picked = shot_list_picked.num_disruptive()
    print('\nFinished preprocessing {} files in {} seconds'.format(
        num_picked, time.time() - start_time))
    print('Using {} shots ({} disruptive shots)'.format(
        num_used, disruptive_used))
    print('Omitted {} shots of {} total shots'.format(
        num_picked - num_used, num_picked))
    print(
        'Omitted {} disruptive shots of {} total disruptive shots'.format(
            disruptive_picked - disruptive_used, disruptive_picked))
    if num_used == 0:
        # the original literals concatenated to "raw data  is" (two spaces)
        print("WARNING: All shots were omitted, please ensure raw data "
              "is complete and available at {}.".format(
                  self.conf['paths']['signal_prepath']))
    return used_shots
def train_on_files(self, shot_files, use_shots, all_machines):
    """Compute (or load) normalization statistics from a sublist of shots.

    Statistics are recomputed when none were previously saved, when some
    machine in ``all_machines`` is missing from the saved ones, or when
    ``conf['data']['recompute_normalization']`` is set. Otherwise the saved
    statistics are loaded. The normalizer is printed at the end either way.

    Args:
        shot_files: shot-list file objects handed to
            ``ShotList.load_from_shot_list_files_objects``.
        use_shots: argument forwarded to ``ShotList.random_sublist``.
        all_machines: collection of machines to normalize for; it must
            support ``-`` against the saved machines and ``len``
            (presumably a set -- confirm against callers).

    Raises:
        Whatever the workers or ``incorporate_stats`` raise; the pool is
        terminated before the exception propagates.
    """
    conf = self.conf
    all_signals = conf['paths']['all_signals']
    shot_list = ShotList()
    shot_list.load_from_shot_list_files_objects(shot_files, all_signals)
    shot_list_picked = shot_list.random_sublist(use_shots)
    num_picked = len(shot_list_picked)

    previously_saved, machines_saved = self.previously_saved_stats()
    machines_to_compute = all_machines - machines_saved
    if conf['data']['recompute_normalization']:
        # forced recompute: ignore anything saved earlier
        machines_to_compute = all_machines
        previously_saved = False

    if not previously_saved or len(machines_to_compute) > 0:
        if previously_saved:
            # keep existing stats and only add the missing machines
            self.load_stats()
        print('computing normalization for machines {}'.format(
            machines_to_compute))
        # leave two cores free, but always use at least one process
        use_cores = max(1, mp.cpu_count() - 2)
        pool = mp.Pool(use_cores)
        print('running in parallel on {} processes'.format(use_cores))
        start_time = time.time()
        try:
            # count from 1 so the progress display ends at N/N
            for (i, stats) in enumerate(
                    pool.imap_unordered(self.train_on_single_shot,
                                        shot_list_picked), 1):
                if stats.machine in machines_to_compute:
                    self.incorporate_stats(stats)
                    self.machines.add(stats.machine)
                sys.stdout.write('\r' + '{}/{}'.format(i, num_picked))
                # no newline is written, so flush to make progress visible
                sys.stdout.flush()
        except BaseException:
            # do not leave worker processes running after a failure
            pool.terminate()
            raise
        else:
            pool.close()
        finally:
            pool.join()
        # leading newline moves off the '\r' progress line
        print('\nFinished Training Normalizer on {} files in {} seconds'
              .format(num_picked, time.time() - start_time))
        self.save_stats()
    else:
        self.load_stats()
    print(self)