def histo_features(runh, refh, const_std_features=True):
    """Build the feature vector comparing a run histogram with its reference.

    Both histograms are turned into distributions with ``make_distribution``;
    per-distribution features, a statistical distance and the difference of
    the two feature sets are then renamed by flavor and joined into a single
    ``pandas.Series``.

    Args:
        runh: Histogram of the run, in the form ``make_distribution`` accepts.
        refh: Histogram of the reference, same form as ``runh``.
        const_std_features: When true, the features of the reference
            distribution itself are included under the ``'ref'`` flavor.

    Returns:
        ``Maybe(value=<Series>)`` on success, or a ``Maybe`` carrying an error
        message when ``make_distribution`` raises ``ValueError``.
    """
    try:
        (rund, run_bars), (refd, ref_bars) = map(make_distribution,
                                                 [runh, refh])
    except ValueError:
        return Maybe(error_message=["Integral for histogram equals zero"])

    runf = distribution_features(rund)
    reff = distribution_features(refd)
    distancef = statistical_distance(refd, rund)
    funcf = reff - runf

    flavors = [(runf, 'run'), (distancef, 'distance'), (funcf, 'func')]
    if const_std_features:
        flavors.append((reff, 'ref'))

    pieces = [
        rename_features(features, flavor) for features, flavor in flavors
    ]

    # The error-ratio alarm is only available when the run histogram came
    # with bars (make_distribution returned something other than None).
    if run_bars is not None:
        pieces.append(
            pd.Series({
                ('alarm', 'max_error_ratio'):
                max_error_ratio(run_bars, ref_bars)
            }))

    # Series.append was removed in pandas 2.0; a single concat joins all the
    # pieces in the same order and avoids repeated copying.
    return Maybe(value=pd.concat(pieces))
def process_tfile(tfile, histo_keys, handle):
    """Convert the requested objects of a .root file into ``Maybe`` results.

    Args:
        tfile: Opened file object handed to ``try_get_object``.
        histo_keys: Iterable of keys to look up in ``tfile``.
        handle: Identifier that is only used inside the error messages.

    Returns:
        A dict mapping every histo key to a ``Maybe``: an error when no
        handler is registered for the object's type or the handler returns
        ``None``, otherwise the ``np_histo`` conversion of the handled object.
    """

    def convert(histo_key):
        obj = try_get_object(tfile, histo_key)
        obj_type = type(obj)

        if obj_type not in HANDLERS:
            return Maybe(error_message=[
                'Error while processing .root. No handler available for {} of {}'
                .format(obj_type, handle)
            ])

        handled = HANDLERS[obj_type](obj)
        if handled is None:
            return Maybe(error_message=[
                'Error while processing .root. No data for {}'.format(handle)
            ])

        return Maybe(value=np_histo(handled))

    return {histo_key: convert(histo_key) for histo_key in histo_keys}
def tprofile_features(runh, refh):
    """Collect the features of a profile histogram against its reference.

    Args:
        runh: Run histogram; its ``'vals'``, ``'entries'`` and ``'errs'``
            items are read.
        refh: Reference histogram with the same items.

    Returns:
        The ``Maybe.concat`` of four parts: regression distance between the
        values, alarm features, value features of the entries and histogram
        features of the errors.
    """
    # Distance between the value arrays of run and reference.
    distance_series = regression_distance(runh['vals'], refh['vals'])
    distance = Maybe(value=rename_features(distance_series, 'distance'))

    # Alarm features: largest absolute run value and the error ratio.
    alarm_series = pd.Series({
        'max_abs': np.max(np.abs(runh['vals'])),
        'max_error_ratio': max_error_ratio(runh, refh),
    })
    alarm = Maybe(value=rename_features(alarm_series, 'alarm'))

    entries_part = value_features(runh['entries'], refh['entries'])

    # The error arrays are compared as if they were histograms themselves.
    errors_part = histo_features({'vals': runh['errs']},
                                 {'vals': refh['errs']})

    return Maybe.concat([distance, alarm, entries_part, errors_part])
def get_features(self, run_number):
    """Extract the histogram features of one run against its reference.

    Args:
        run_number: Run identifier passed to the collector.

    Returns:
        A ``FeatureContainer``. It is flagged ``Flag.BAD`` when the run or
        reference data is missing or when no features could be produced at
        all, and ``Flag.TAIL`` when an individual histogram failed.
    """
    reference_data = self.collector.get_reference(run_number)
    run_data = self.collector.get_run(run_number)

    both = Maybe.concat_id([run_data, reference_data])
    if both.is_error():
        return FeatureContainer(Flag.BAD, both.error_message)

    container = FeatureContainer()
    for histo_key in self.collector.get_histo_keys():
        pair = Maybe.concat_id([side[histo_key] for side in both.value])
        if pair.is_error():
            container.add_histo_errors(histo_key, pair.error_message)
            container.set_flag(Flag.TAIL)
            continue

        run_histo, ref_histo = pair.value
        histo_type = self.get_histo_type(histo_key, run_histo)
        # !!!!!!!!!!!!!!!!!!!!!!!!!!!
        # TH2D histograms are skipped without recording anything.
        if histo_type == 'TH2D':
            continue

        handler = FeatureExtractor.HANDLERS[histo_type]
        run_histo = self.tune_histo(histo_type, run_histo)
        ref_histo = self.tune_histo(histo_type, ref_histo)

        extracted = handler(run_histo, ref_histo)
        if extracted.is_error():
            container.add_histo_errors(histo_key, extracted.error_message)
            container.set_flag(Flag.TAIL)
            continue

        container.add_features(
            rename_features(extracted.value, ('my', histo_type, histo_key)))

    if not container.has_features():
        container.add_errors(['No histo features at all.'])
        container.set_flag(Flag.BAD)
    return container
def weird_tprofile_features(runh, refh):
    """Profile features restricted to bins that are non-zero in both inputs.

    Every ``numpy.ndarray`` item of ``runh`` and ``refh`` is replaced, in
    place, by its selection under the common non-zero mask before the regular
    profile features are computed.

    Args:
        runh: Run histogram dict; ``'vals'`` defines the mask.
        refh: Reference histogram dict; ``'vals'`` defines the mask.

    Returns:
        An error ``Maybe`` when fewer than three bins survive the mask,
        otherwise the ``Maybe.concat`` of the surviving-bin count and
        ``tprofile_features`` of the filtered histograms.
    """
    nonzero_mask = (runh['vals'] != 0) & (refh['vals'] != 0)
    kept = nonzero_mask.sum()

    if kept < 3:
        return Maybe(
            error_message=['Too many zeros in graph to process features'])

    # Filter the array-valued items of both dicts; other items are untouched.
    for histo in (runh, refh):
        for name, item in histo.items():
            if not isinstance(item, np.ndarray):
                continue
            histo[name] = item[nonzero_mask]

    kept_feature = Maybe(value=pd.Series({('zeros', 'vals'): kept}))
    return Maybe.concat([kept_feature, tprofile_features(runh, refh)])
def tefficiency_features(runh, refh):
    """Collect the features of an efficiency histogram against its reference.

    Args:
        runh: Run histogram, passed to ``get_efficiency``.
        refh: Reference histogram, passed to ``get_efficiency``.

    Returns:
        The ``Maybe.concat`` of histogram features of the efficiencies, value
        features of the statistics, and value features of the efficiency
        values under the ``'eff'`` prefix.
    """
    run_eff, run_stats = get_efficiency(runh)
    ref_eff, ref_stats = get_efficiency(refh)

    parts = [
        histo_features(run_eff, ref_eff),
        value_features(run_stats, ref_stats),
        value_features(run_eff['vals'], ref_eff['vals'], 'eff'),
    ]
    return Maybe.concat(parts)
def process_run(arg):
    """Fetch one .root file, convert it and hand the result to the collector.

    Args:
        arg: A 5-tuple ``(self, histo_keys, run_number, handle, reference)``.
            A truthy ``reference`` selects the reference file getter of the
            external collector, otherwise the run file getter is used.

    The outcome (converted contents, or an error when the external collector
    returns ``None``) is written through ``self.collector.write_root2py``.
    """
    self, histo_keys, run_number, handle, reference = arg

    if reference:
        fetch_tfile = self.external_collector.get_reference_tfile
    else:
        fetch_tfile = self.external_collector.get_run_tfile
    write = self.collector.write_root2py

    tfile = fetch_tfile(run_number)
    if tfile is not None:
        outcome = Maybe(value=process_tfile(tfile, histo_keys, handle))
    else:
        outcome = Maybe(error_message=[
            "External collector doesn't give any .root for the handle"
            " {} of run {}".format(handle, run_number)
        ])

    write(reference, outcome, handle)
def value_features(runv, refv, prefix='stats'):
    """Compare the run and reference values by their sums and distance.

    Args:
        runv: Run values; must provide ``.sum()``.
        refv: Reference values; must provide ``.sum()``.
        prefix: Name passed to ``rename_features`` for the resulting series.

    Returns:
        ``Maybe(value=...)`` wrapping the renamed series made of both sums,
        their difference (reference minus run) and the regression distance.
    """
    run_total = runv.sum()
    ref_total = refv.sum()

    totals = pd.Series({
        'runs': run_total,
        'refs': ref_total,
        'refs - runs': ref_total - run_total,
    })
    distance = regression_distance(runv, refv)

    combined = pd.concat([totals, distance])
    return Maybe(value=rename_features(combined, prefix))
def make_result(self, result, error_on_fail):
    """Return ``result`` unchanged, or an error ``Maybe`` when it is ``None``.

    Args:
        result: Value to pass through; only ``None`` counts as a failure.
        error_on_fail: Message stored in the error ``Maybe`` on failure.
    """
    if result is not None:
        return result
    return Maybe(error_message=[error_on_fail])
class FeatureExtractor:
    """Turns collected histograms and run metadata into feature tables.

    NOTE(review): ``make_features`` passes 2-tuples ``(self, run_number)`` to
    ``process_run``; confirm that the ``process_run`` in scope unpacks
    exactly that shape.
    """

    # Feature function used for each histogram type.
    HANDLERS = {
        'TH1D': th1d_features,
        'TH2D': lambda runh, refh: Maybe(
            error_message=['No handler for TH2D available']),  # it's sad
        'TEfficiency': tefficiency_features,
        'TProfile': tprofile_features,
        'WeirdTProfile': weird_tprofile_features,
        'WeirdMuTProfile': weird_tprofile_features,
        'DecodingErrors': decoding_errors_features
    }

    # Histogram keys whose type is overridden, grouped by the overriding type.
    WEIRD_HT = {
        'WeirdTProfile': [
            'RICH/RiLongTrkEff/All/effVChi2PDOF',
            'Velo/VeloTrackMonitor/Pseudoefficiency_per_sensor_vs_sensorID',
        ],
        'WeirdMuTProfile': ['MuIDLambdaPlot/pion/Prof_eff',
                            'MuIDLambdaPlot/proton/Prof_eff'],
        'DecodingErrors': ['RICH/RichDecodingErrors/decodingErrors']
    }

    # Run-database columns cast to float64 in get_linear_data.
    NUMS = [
        'avHltPhysRate', 'avL0PhysRate', 'avLumi', 'avMu', 'avPhysDeadTime',
        'beamenergy', 'beamgasTrigger', 'betaStar', 'endlumi', 'lumiTrigger',
        'magnetCurrent', 'nobiasTrigger', 'partitionid', 'run_state', 'tck',
        'veloOpening'
    ]

    # Run-database columns label-encoded in get_linear_data.
    CATEGORICAL = [
        'LHCState', 'activity', 'magnetState', 'partitionname', 'program',
        'programVersion', 'runtype', 'state', 'triggerConfiguration',
        'veloPosition', 'destination'
    ]

    # Run-database columns converted with get_time in get_linear_data.
    TIME = ['starttime', 'endtime']

    def __init__(self, collector, njobs):
        """Store the data collector and the worker-pool size."""
        self.collector = collector
        self.njobs = njobs

    def tune_histo(self, ht, runh):
        """Return a deep copy of ``runh`` adjusted for histogram type ``ht``.

        For ``'TH1D'`` the ``'vals'`` item is clipped from below at zero;
        other types are only copied. The input is never modified.
        """
        runh = deepcopy(runh)
        if ht == 'TH1D':
            runh['vals'] = np.maximum(0, runh['vals'])
        return runh

    def get_histo_type(self, histo_key, data):
        """Return the type for ``histo_key``.

        The inverse of ``WEIRD_HT`` is consulted first; when the key is not
        listed there, ``data['type']`` is used.
        """
        histo_types = inverse_dict(FeatureExtractor.WEIRD_HT)
        return histo_types.get(histo_key, data['type'])

    def get_features(self, run_number):
        """Extract the histogram features of one run against its reference.

        Args:
            run_number: Run identifier passed to the collector.

        Returns:
            A ``FeatureContainer``. It is flagged ``Flag.BAD`` when the run
            or reference data is missing or when no features were produced
            at all, and ``Flag.TAIL`` when an individual histogram failed.
        """
        refh = self.collector.get_reference(run_number)
        runh = self.collector.get_run(run_number)

        present = Maybe.concat_id([runh, refh])
        if present.is_error():
            return FeatureContainer(Flag.BAD, present.error_message)

        result = FeatureContainer()
        for histo_key in self.collector.get_histo_keys():
            histo_present = Maybe.concat_id(
                [a[histo_key] for a in present.value])
            if histo_present.is_error():
                result.add_histo_errors(histo_key,
                                        histo_present.error_message)
                result.set_flag(Flag.TAIL)
                continue

            runh, refh = histo_present.value
            ht = self.get_histo_type(histo_key, runh)
            # TH2D histograms are skipped outright, so the TH2D entry in
            # HANDLERS is never reached from here and no error is recorded.
            if ht == 'TH2D':
                continue

            handler = FeatureExtractor.HANDLERS[ht]
            runh = self.tune_histo(ht, runh)
            refh = self.tune_histo(ht, refh)

            features = handler(runh, refh)
            if features.is_error():
                result.add_histo_errors(histo_key, features.error_message)
                result.set_flag(Flag.TAIL)
            else:
                renamed = rename_features(features.value,
                                          ('my', ht, histo_key))
                result.add_features(renamed)

        if not result.has_features():
            result.add_errors(['No histo features at all.'])
            result.set_flag(Flag.BAD)
        return result

    def make_features(self, run_numbers):
        """Run ``process_run`` for every run number in a process pool.

        Args:
            run_numbers: Sized collection of run numbers.

        For debugging, the same work can be done serially by calling
        ``process_run`` on each argument tuple.
        """
        args = zip([self] * len(run_numbers), run_numbers)
        pool = Pool(self.njobs)
        try:
            pool.map(process_run, args)
        finally:
            # Shut the workers down even when a task raised, so the pool
            # does not leak processes.
            pool.close()
            pool.join()

    def get_linear_data(self):
        """Build the per-run table of "linear" (non-histogram) features.

        The collector's linear data is split into its top-level columns and
        the nested ``'rundb_data'`` records. The run-database columns are
        converted by kind (``NUMS``, ``CATEGORICAL``, ``TIME``), a
        ``'run_length'`` column is derived, and everything is merged on the
        run index. Every column except ``'flag'`` is renamed to
        ``('linear', <column>)``.

        Returns:
            A ``pandas.DataFrame`` indexed by integer run number.
        """
        linear_data = self.collector.get_linear_data()

        df = pd.DataFrame.from_dict(linear_data,
                                    orient='index').drop('rundb_data', axis=1)
        # Builtin int replaces np.int, an alias that NumPy 1.24 removed.
        df.index = df.index.astype(int)

        rundb = {
            key: value['rundb_data']
            for key, value in linear_data.items()
        }
        rundf = pd.DataFrame.from_dict(rundb, orient='index')
        rundf.index = rundf.index.astype(int)
        rundf = rundf[FeatureExtractor.NUMS + FeatureExtractor.CATEGORICAL +
                      FeatureExtractor.TIME]

        for col in FeatureExtractor.CATEGORICAL:
            rundf[col] = LabelEncoder().fit_transform(rundf[col])
        for col in FeatureExtractor.NUMS:
            rundf[col] = rundf[col].astype(np.float64)
        for col in FeatureExtractor.TIME:
            rundf[col] = rundf[col].map(get_time)

        rundf['run_length'] = rundf['endtime'] - rundf['starttime']
        # A negative length is treated as missing.
        rundf.loc[rundf['run_length'] < 0, 'run_length'] = np.nan

        df = rundf.merge(df, left_index=True, right_index=True)
        df['reference'] = pd.Series(df.index, index=df.index).map(
            self.collector.get_data_ref())
        # Running counter that increases whenever the reference differs from
        # the one in the previous row.
        df['switch'] = (df['reference'] !=
                        df['reference'].shift(1)).astype(int).cumsum()

        df = df.rename(
            columns=lambda col: ('linear', col) if col != 'flag' else 'flag')
        return df
def decoding_errors_features(runh, refh):
    """Return the sum of the run's ``'vals'`` as the only feature.

    Args:
        runh: Run histogram; its ``'vals'`` item is summed.
        refh: Reference histogram; not used.

    Returns:
        ``Maybe(value=...)`` wrapping a one-element series keyed by
        ``('decoding', 'errors')``.
    """
    total_errors = runh['vals'].sum()
    features = pd.Series({('decoding', 'errors'): total_errors})
    return Maybe(value=features)
def th1d_features(runh, refh):
    """Collect the features of a 1D histogram against its reference.

    Args:
        runh: Run histogram; its ``'vals'`` and ``'mean'`` items are read.
        refh: Reference histogram; its ``'vals'`` item is read.

    Returns:
        The ``Maybe.concat`` of the histogram features, the value features of
        the bin values and the run mean as the ``('alarm', 'mean')`` feature.
    """
    shape_part = histo_features(runh, refh)
    values_part = value_features(runh['vals'], refh['vals'])
    mean_part = Maybe(value=pd.Series({('alarm', 'mean'): runh['mean']}))
    return Maybe.concat([shape_part, values_part, mean_part])