def save_temporal_encoding(train_handler, data_loader, save_rootdir,
    days_n:int=C_.DEFAULT_DAYS_N,
    **kwargs):
    train_handler.load_model() # important, refresh to best model
    train_handler.model.eval() # model eval
    dataset = data_loader.dataset # get dataset
    if not hasattr(train_handler.model, 'get_info'):
        return

    days = np.linspace(C_.DEFAULT_MIN_DAY, dataset.max_day, days_n)#[::-1]
    temporal_encoding_info = train_handler.model.get_info()
    #print('temporal_encoding_info', temporal_encoding_info)
    results = {
        'model_name':train_handler.model.get_name(),
        'survey':dataset.survey,
        'band_names':dataset.band_names,
        'class_names':dataset.class_names,
        'days':days,
        'temporal_encoding_info':temporal_encoding_info,
        }

    ### save file
    save_filedir = f'{save_rootdir}/{dataset.lcset_name}/id={train_handler.id}.d'
    files.save_pickle(save_filedir, results) # save file
    return
def save_model_info(train_handler, data_loader, save_rootdir,
    **kwargs):
    train_handler.load_model() # important, refresh to best model
    train_handler.model.eval() # model eval
    dataset = data_loader.dataset # get dataset

    results = {
        'model_name':train_handler.model.get_name(),
        'survey':dataset.survey,
        'band_names':dataset.band_names,
        'class_names':dataset.class_names,
        'parameters':count_parameters(train_handler.model),
        'monitors':{},
        }
    for lmonitor in train_handler.lmonitors:
        results['monitors'][lmonitor.name] = {
            'save_dict':lmonitor.get_save_dict(),
            'best_epoch':lmonitor.get_best_epoch(),
            'time_per_iteration':lmonitor.get_time_per_iteration(),
            #'time_per_epoch_set':{set_name:lmonitor.get_time_per_epoch_set(set_name) for set_name in ['train', 'val']},
            'time_per_epoch':lmonitor.get_time_per_epoch(),
            'total_time':lmonitor.get_total_time(),
            }

    ### save file
    save_filedir = f'{save_rootdir}/{dataset.lcset_name}/id={train_handler.id}.d'
    files.save_pickle(save_filedir, results) # save file
    return
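# Hedged helper sketch (not from the original source): `count_parameters` above comes from
# an external utility module; a minimal stand-in using the standard PyTorch idiom would be
# the following, counting only the trainable parameters of a torch.nn.Module.
import torch

def count_parameters_sketch(model: torch.nn.Module) -> int:
    return sum(p.numel() for p in model.parameters() if p.requires_grad) # total trainable parameters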
def save_dim_reductions(train_handler, data_loader, save_rootdir,
    target_is_onehot:bool=False,
    target_y_key='target/y',
    pred_y_key='model/y',
    days_n:int=DEFAULT_DAYS_N,
    random_state=RANDOM_STATE,
    **kwargs):
    train_handler.load_model() # important, refresh to best model
    train_handler.model.eval() # important, model eval mode
    dataset = data_loader.dataset # get dataset
    dataset.reset_max_day() # always reset max day

    days_embeddings = {}
    days_y_true = {}
    days = np.linspace(DEFAULT_MIN_DAY, dataset.max_day, days_n)#[::-1]
    bar = ProgressBar(len(days))
    with torch.no_grad():
        for day in days:
            dataset.set_max_day(day) # very important!!
            dataset.calcule_precomputed() # very important!!

            tdicts = []
            for ki,in_tdict in enumerate(data_loader):
                _tdict = train_handler.model(TDictHolder(in_tdict).to(train_handler.device))
                tdicts += [_tdict]
            tdict = minibatch_dict_collate(tdicts)

            ### class prediction
            y_true = tdict[target_y_key] # (b)
            #y_pred_p = torch.nn.functional.softmax(tdict[pred_y_key], dim=-1) # (b,c)
            y_pred_p = torch.sigmoid(tdict[pred_y_key]) # (b,c)
            #print('y_pred_p', y_pred_p[0])
            if target_is_onehot:
                assert y_pred_p.shape==y_true.shape
                y_true = torch.argmax(y_true, dim=-1)

            y_true = tensor_to_numpy(y_true)
            y_pred_p = tensor_to_numpy(y_pred_p)
            days_y_true[day] = y_true

            ### embeddings
            encz_last = tdict[f'model/encz_last']
            days_embeddings[day] = tensor_to_numpy(encz_last)
            bar(f'day={day:.3f}/{days[-1]:.3f}; {days_embeddings[day][:5,0]}')

    bar.done()

    ### train map
    scaler = StandardScaler()
    reduction_map = UMAP(
        n_components=2,
        metric='euclidean',
        n_neighbors=10, # 5 10 20 50
        min_dist=.01, # .01 .1 .2 .5 .9
        random_state=random_state,
        transform_seed=random_state,
        #verbose=1,
        )
    dim_reductor = DimReductor(scaler, reduction_map,
        inter_pca_dims=16,
        )
    x = np.concatenate([days_embeddings[day] for day in days], axis=0)
    dim_reductor.fit(x,
        drop_duplicates=True,
        #normal_std=1e-5,
        )

    ### compute maps
    days_dim_reductions = {}
    bar = ProgressBar(len(days))
    for day in days:
        x = days_embeddings[day]
        new_x = dim_reductor.transform(x)
        days_dim_reductions[day] = new_x
        bar(f'day={day:.3f}/{days[-1]:.3f}; x.shape={x.shape}; new_x.shape={new_x.shape}')

    bar.done()
    results = {
        'model_name':train_handler.model.get_name(),
        'survey':dataset.survey,
        'band_names':dataset.band_names,
        'class_names':dataset.class_names,
        'days':days,
        'days_dim_reductions':days_dim_reductions,
        'days_y_true':days_y_true,
        }

    ### save file
    save_filedir = f'{save_rootdir}/{dataset.lcset_name}/id={train_handler.id}.d'
    files.save_pickle(save_filedir, results) # save file
    dataset.reset_max_day() # very important!!
    dataset.calcule_precomputed() # very important!!
    return
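# Hedged sketch (not the actual DimReductor implementation): given the arguments used
# above (a scaler, a UMAP map, inter_pca_dims=16, drop_duplicates=True), one plausible
# composition is scaler -> intermediate PCA -> 2D embedding, roughly as follows.
import numpy as np
from sklearn.decomposition import PCA

class DimReductorSketch():
    def __init__(self, scaler, reduction_map,
        inter_pca_dims=None,
        ):
        self.scaler = scaler # e.g. sklearn StandardScaler
        self.reduction_map = reduction_map # e.g. umap.UMAP
        self.pca = PCA(n_components=inter_pca_dims) if inter_pca_dims is not None else None

    def fit(self, x,
        drop_duplicates=False,
        ):
        if drop_duplicates:
            x = np.unique(x, axis=0) # duplicated rows can break neighbour-graph based maps
        x = self.scaler.fit_transform(x)
        if self.pca is not None:
            x = self.pca.fit_transform(x) # intermediate PCA before the 2D map
        self.reduction_map.fit(x)
        return self

    def transform(self, x):
        x = self.scaler.transform(x)
        if self.pca is not None:
            x = self.pca.transform(x)
        return self.reduction_map.transform(x)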
plot_obse_samplers(lcset_name, lcset_info, obse_sampler_bdict_full,
    original_space=0,
    add_samples=0,
    save_filedir=f'../save/obse_sampler/{cfilename}/{lcset_name}/00.png',
    )
plot_obse_samplers(lcset_name, lcset_info, obse_sampler_bdict_full,
    original_space=1,
    add_samples=1,
    save_filedir=f'../save/obse_sampler/{cfilename}/{lcset_name}/11.png',
    )
save_pickle(f'../save/obse_sampler/{cfilename}/{lcset_name}/obse_sampler_bdict_full.d', obse_sampler_bdict_full)
obse_sampler_bdict = along_dict_obj_method(obse_sampler_bdict_full, 'clean')
save_pickle(f'../save/obse_sampler/{cfilename}/{lcset_name}/obse_sampler_bdict.d', obse_sampler_bdict)

### generate synth curves
sd_kwargs = {
    'synthetic_samples_per_curve':_C.SYNTH_SAMPLES_PER_CURVE,
    'method':main_args.method,
    'sne_specials_df':pd.read_csv(f'../data/{survey}/sne_specials.csv'),
def save_performance(train_handler, data_loader, save_rootdir,
    target_is_onehot:bool=False,
    target_y_key='target/y',
    pred_y_key='model/y',
    days_n:int=DEFAULT_DAYS_N,
    **kwargs):
    train_handler.load_model() # important, refresh to best model
    train_handler.model.eval() # important, model eval mode
    dataset = data_loader.dataset # get dataset
    dataset.reset_max_day() # always reset max day

    days_rec_metrics_df = DFBuilder()
    days_class_metrics_df = DFBuilder()
    days_class_metrics_cdf = {c:DFBuilder() for c in dataset.class_names}
    days_predictions = {}
    days_cm = {}
    days = np.linspace(C_.DEFAULT_MIN_DAY, dataset.max_day, days_n)#[::-1]
    bar = ProgressBarMulti(len(days), 4)
    with torch.no_grad():
        can_be_in_loop = True
        for day in days:
            dataset.set_max_day(day) # very important!!
            dataset.calcule_precomputed() # very important!!
            try:
                if can_be_in_loop:
                    tdicts = []
                    for ki,in_tdict in enumerate(data_loader):
                        _tdict = train_handler.model(TDictHolder(in_tdict).to(train_handler.device))
                        tdicts += [_tdict]
                    tdict = minibatch_dict_collate(tdicts)

                    ### mse
                    mse_loss_bdict = {}
                    for kb,b in enumerate(dataset.band_names):
                        p_onehot = tdict[f'input/onehot.{b}'][...,0] # (b,t)
                        #p_rtime = tdict[f'input/rtime.{b}'][...,0] # (b,t)
                        #p_dtime = tdict[f'input/dtime.{b}'][...,0] # (b,t)
                        #p_x = tdict[f'input/x.{b}'] # (b,t,f)
                        p_rerror = tdict[f'target/rerror.{b}'] # (b,t,1)
                        p_rx = tdict[f'target/recx.{b}'] # (b,t,1)
                        p_rx_pred = tdict[f'model/decx.{b}'] # (b,t,1)

                        mse_loss_b = (p_rx-p_rx_pred)**2/(C_.REC_LOSS_EPS+C_.REC_LOSS_K*(p_rerror**2)) # (b,t,1)
                        mse_loss_b = seq_utils.seq_avg_pooling(mse_loss_b, p_onehot)[...,0] # (b,t,1) > (b,t) > (b)
                        mse_loss_bdict[b] = mse_loss_b[...,0] # (b,1) > (b)

                    mse_loss = torch.cat([mse_loss_bdict[b][...,None] for b in dataset.band_names], axis=-1).mean(dim=-1) # (b,d) > (b)
                    mse_loss = mse_loss.mean()
                    days_rec_metrics_df.append(day, {
                        '_day':day,
                        'mse':tensor_to_numpy(mse_loss),
                        })

                    ### class prediction
                    y_true = tdict[target_y_key] # (b)
                    #y_pred_p = torch.nn.functional.softmax(tdict[pred_y_key], dim=-1) # (b,c)
                    y_pred_p = torch.sigmoid(tdict[pred_y_key]) # (b,c)
                    #print('y_pred_p', y_pred_p[0])
                    if target_is_onehot:
                        assert y_pred_p.shape==y_true.shape
                        y_true = torch.argmax(y_true, dim=-1)

                    y_true = tensor_to_numpy(y_true)
                    y_pred_p = tensor_to_numpy(y_pred_p)
                    days_predictions[day] = {'y_true':y_true, 'y_pred_p':y_pred_p}
                    metrics_cdict, metrics_dict, cm = fcm.get_multiclass_metrics(y_pred_p, y_true, dataset.class_names)
                    for c in dataset.class_names:
                        days_class_metrics_cdf[c].append(day, update_dicts([{'_day':day}, metrics_cdict[c]]))
                    days_class_metrics_df.append(day, update_dicts([{'_day':day}, metrics_dict]))
                    days_cm[day] = cm

                    ### progress bar
                    recall = {c:metrics_cdict[c]['recall'] for c in dataset.class_names}
                    bmetrics_dict = {k:metrics_dict[k] for k in metrics_dict.keys() if 'b-' in k}
                    bar([f'lcset_name={dataset.lcset_name}; day={day:.3f}', f'mse_loss={mse_loss}', f'bmetrics_dict={bmetrics_dict}', f'recall={recall}'])

            except KeyboardInterrupt:
                can_be_in_loop = False

    bar.done()
    d = {
        'model_name':train_handler.model.get_name(),
        'survey':dataset.survey,
        'band_names':dataset.band_names,
        'class_names':dataset.class_names,
        'lcobj_names':dataset.get_lcobj_names(),
        'days':days,
        'days_rec_metrics_df':days_rec_metrics_df.get_df(),
        'days_predictions':days_predictions,
        'days_class_metrics_df':days_class_metrics_df.get_df(),
        'days_class_metrics_cdf':{c:days_class_metrics_cdf[c].get_df() for c in dataset.class_names},
        'days_cm':days_cm,
        }

    ### save file
    save_filedir = f'{save_rootdir}/{dataset.lcset_name}/id={train_handler.id}.d'
    files.save_pickle(save_filedir, d) # save file
    dataset.reset_max_day() # very important!!
    dataset.calcule_precomputed() # very important!!
    return
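# Hedged sketch (not from the original source): the per-band reconstruction error used in
# save_performance is an observation-error-weighted MSE, averaged only over the valid
# (onehot-masked) timesteps. A minimal NumPy version, with hypothetical eps/k values in
# place of C_.REC_LOSS_EPS and C_.REC_LOSS_K, looks like this.
import numpy as np

def weighted_rec_mse_sketch(x, x_pred, obse, onehot,
    eps=1., # stands in for C_.REC_LOSS_EPS
    k=1., # stands in for C_.REC_LOSS_K
    ):
    # x, x_pred, obse: (b,t) arrays; onehot: (b,t) boolean mask of observed timesteps
    se = (x-x_pred)**2/(eps+k*obse**2) # error-weighted squared error, (b,t)
    mask = onehot.astype(se.dtype)
    per_curve = (se*mask).sum(axis=-1)/np.clip(mask.sum(axis=-1), 1, None) # masked average per curve, (b,)
    return per_curve.mean() # scalar, averaged over the batch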
    ] if main_args.setn == '.' else main_args.setn
setns = [setns] if isinstance(setns, str) else setns
new_lcdataset = lcdataset.copy() # copy with all original lcsets
for setn in setns:
    for kf in kfs:
        lcset_name = f'{kf}@{setn}'
        lcset = new_lcdataset[lcset_name]
        synth_rootdir = f'../save/ssne/{main_args.method}/{cfilename}/{lcset_name}'
        print('synth_rootdir=', synth_rootdir)
        synth_lcset = lcset.copy({}) # copy
        filedirs = fcfiles.get_filedirs(synth_rootdir, fext='ssne')
        bar = ProgressBar(len(filedirs))
        for filedir in filedirs:
            d = fcfiles.load_pickle(filedir)
            lcobj_name = d['lcobj_name']
            bar(f'lcset_name={lcset_name} - lcobj_name={lcobj_name}')
            for k,new_lcobj in enumerate(d['new_lcobjs']):
                synth_lcset.set_lcobj(f'{lcobj_name}.{k+1}', new_lcobj)

        bar.done()
        synth_lcset.reset()
        new_lcset_name = f'{lcset_name}.{main_args.method}'
        new_lcdataset.set_lcset(new_lcset_name, synth_lcset)

save_rootdir = f'{rootdir}'
save_filedir = f'{save_rootdir}/{cfilename}~method={main_args.method}.{_C.EXT_SPLIT_LIGHTCURVE}'
save_pickle(save_filedir, new_lcdataset)
    train_df_x = pd.concat([train_df_x_r]*k, axis='rows')
    train_df_y = pd.concat([train_df_y_r]*k, axis='rows')

if train_config == 's':
    #train_df_x_s, train_df_y_s = load_features(f'../save/fats/{cfilename}/{main_args.kf}@train.{main_args.method}.df', main_args.mode)
    k = 1 # 1 2
    train_df_x = pd.concat([train_df_x_s]*k, axis='rows')
    train_df_y = pd.concat([train_df_y_s]*k, axis='rows')

if train_config == 'r+s':
    #train_df_x_s, train_df_y_s = load_features(f'../save/fats/{cfilename}/{main_args.kf}@train.{main_args.method}.df', main_args.mode)
    #s_repeats = len(train_df_x_s)//len(train_df_x_r)
    train_df_x = pd.concat([train_df_x_r]*s_repeats+[train_df_x_s], axis='rows')
    train_df_y = pd.concat([train_df_y_r]*s_repeats+[train_df_y_s], axis='rows')

fit_kwargs = {}
features = list(train_df_x.columns)
val_df_x, val_df_y = load_features(f'../save/fats/{cfilename}/{main_args.kf}@val.df', main_args.mode)
brf_d = train_classifier(train_df_x, train_df_y, val_df_x, val_df_y, lcset_info, **fit_kwargs)
d = evaluate_classifier(brf_d, f'../save/fats/{cfilename}/{main_args.kf}@test.df', main_args.mode, lcset_info, **fit_kwargs)

save_rootdir = f'../save'
save_filedir = f'{save_rootdir}/exp=rf_eval~train_config={train_config}~mode={main_args.mode}/{cfilename}/{main_args.kf}@test/id={main_args.mid}.d'
save_pickle(save_filedir, d)
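# Hedged sketch (not from the original source): the `r+s` configuration above repeats the
# real-feature frame so its row count roughly matches the larger synthetic frame before
# concatenation. A toy example with hypothetical DataFrames:
import pandas as pd

train_df_x_r_toy = pd.DataFrame({'f0':[1., 2.]}) # 2 real samples
train_df_x_s_toy = pd.DataFrame({'f0':[0.]*10}) # 10 synthetic samples
s_repeats_toy = len(train_df_x_s_toy)//len(train_df_x_r_toy) # 5
balanced_toy = pd.concat([train_df_x_r_toy]*s_repeats_toy+[train_df_x_s_toy], axis='rows')
print(len(balanced_toy)) # 20: the 2 real rows repeated 5 times, plus the 10 synthetic rows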
def export_dictionary(self, description:str, save_folder:str,
    band_names:list=None,
    filename_extra_parameters:dict={},
    npartitions:int=C_.N_JOBS,
    any_band_points=C_.MIN_POINTS_LIGHTCURVE_SURVEY_EXPORT,
    outliers_df=None,
    ):
    class_dfkey = self.df_index_names['label']
    band_dfkey = self.df_index_names['band']

    ### separate bands for optimal
    band_names = list(self.band_dictionary.keys()) if band_names is None else band_names
    print(f'band_names={band_names}')

    ### clean dataframe to speed up things in the objects search
    detections_df = self.detections_df.reset_index()
    print(f'cleaning the DataFrame - samples={len(detections_df):,}')
    #print('detections_df', detections_df[detections_df[self.df_index_names['oid']]=='ZTF17aabwgdw'])
    detections_ddf = dd.from_pandas(detections_df, npartitions=npartitions)
    detections_df = detections_ddf.loc[detections_ddf[self.df_index_names['band']].isin([self.band_dictionary[b] for b in band_names])].compute()
    print(f'remove_invalid_bands > samples={len(detections_df):,}')

    detections_ddf = dd.from_pandas(detections_df, npartitions=npartitions)
    detections_df = detections_ddf.loc[detections_ddf[self.df_index_names['oid']].isin(list(set(self.labels_df.index)))].compute()
    print(f'remove_invalid_classes > samples={len(detections_df):,}')

    detections_ddf = dd.from_pandas(detections_df, npartitions=npartitions)
    detections_df = detections_ddf.loc[detections_ddf[self.df_index_names['obs']]>0].compute()
    print(f'remove_negative_obs > samples={len(detections_df):,}')

    detections_df = detections_df.set_index(self.df_index_names['oid'])

    ### prepare dataset
    lcset = dsc.LCSet(
        {},
        self.survey_name,
        description,
        band_names,
        self.class_names,
        True,
        )
    lcdataset = dsc.LCDataset()
    lcdataset.set_lcset('outliers', lcset.copy())
    lcdataset.set_lcset('faint', lcset.copy())
    lcdataset.set_lcset('raw', lcset.copy())

    ### get filename
    filename_parameters = {
        'survey':self.survey_name,
        'bands':''.join(band_names),
        }
    filename_parameters.update(filename_extra_parameters)
    save_filedir = f'{save_folder}/{self.get_dict_name(filename_parameters)}.{C_.EXT_RAW_LIGHTCURVE}'
    print(f'save_filedir={save_filedir}')

    ### easy variables
    outliers = [] if outliers_df is None else list(outliers_df['outliers'].values)
    easy_label_dict = {self.class_to_label_dict[c]:kc for kc,c in enumerate(self.class_names)}
    print(f'easy_label_dict={easy_label_dict}')

    # start loop
    correct_samples = 0
    detections_ddf = dd.from_pandas(detections_df, npartitions=npartitions)
    lcobj_names = sorted(list(set(detections_df.index)))
    bar = ProgressBar(len(lcobj_names))
    for k,lcobj_name in enumerate(lcobj_names):
        try:
            lcobj = lcc.LCO()

            ### get detections
            obj_df = detections_ddf.loc[lcobj_name].compute() # FAST
            for kb,b in enumerate(band_names):
                band_object_df = obj_df[obj_df[band_dfkey]==self.band_dictionary[b]]
                original_lc = band_object_df[[self.df_index_names['obs_day'], self.df_index_names['obs'], self.df_index_names['obs_error']]].values
                band_lc_flux = self.get_band(original_lc)
                lcobj.add_b(b, band_lc_flux[:,0], band_lc_flux[:,1], band_lc_flux[:,2])

            lcobj.clean_small_cadence()
            lcobj.reset_day_offset_serial()

            ### get label
            y = self.get_label(self.labels_df, lcobj_name, easy_label_dict)
            lcobj.set_y(y)

            ### check lengths
            if lcobj.any_band_eqover_length(any_band_points):
                ra, dec = self.get_radec(self.labels_df, lcobj_name)
                lcobj.ra = ra
                lcobj.dec = dec
                lcset_name = 'raw'
                if lcobj_name in outliers:
                    lcset_name = 'outliers'
                elif lcobj.get_snr()<C_.MIN_SNR:
                    lcset_name = 'faint'
                lcdataset[lcset_name].set_lcobj(lcobj_name, lcobj)
                correct_samples += 1
            else:
                pass
                #print(lcobj_name)

            bar(f'obj={lcobj_name} - y={y} - c={self.class_names[y]} - lengths_bdict={lcobj.get_length_bdict()} - correct_samples (any-band>={any_band_points})={correct_samples:,}')

        except KeyboardInterrupt:
            bar.done()
            print('stopped!')
            break

    bar.done()
    save_pickle(save_filedir, lcdataset)
    return lcdataset
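# Hedged sketch (not from the original source): the per-object lookups above rely on a
# dask DataFrame indexed by object id, so `.loc[oid].compute()` only materialises one
# object's detections at a time. A toy example of the same pattern, with hypothetical
# column names:
import pandas as pd
import dask.dataframe as dd

toy_df = pd.DataFrame({
    'oid':['obj1', 'obj1', 'obj2'],
    'fid':[1, 2, 1],
    'obs':[18.2, 18.5, 19.1],
    }).set_index('oid')
toy_ddf = dd.from_pandas(toy_df, npartitions=2)
print(toy_ddf.loc['obj1'].compute()) # only the rows for obj1 are brought into memory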
def save_attnstats(train_handler, data_loader, save_rootdir,
    eps:float=C_.EPS,
    djs=[2, 3],
    **kwargs):
    train_handler.load_model() # important, refresh to best model
    train_handler.model.eval() # important, model eval mode
    dataset = data_loader.dataset # get dataset

    is_parallel = 'Parallel' in train_handler.model.get_name()
    if not is_parallel:
        return

    attn_scores_collection = {b:[] for kb,b in enumerate(dataset.band_names)}
    with torch.no_grad():
        tdicts = []
        for ki,in_tdict in enumerate(data_loader):
            train_handler.model.autoencoder['encoder'].add_extra_return = True
            _tdict = train_handler.model(TDictHolder(in_tdict).to(train_handler.device))
            train_handler.model.autoencoder['encoder'].add_extra_return = False
            tdicts += [_tdict]
        tdict = minibatch_dict_collate(tdicts)

        for kb,b in enumerate(dataset.band_names):
            p_onehot = tdict[f'input/onehot.{b}'][...,0] # (b,t)
            #p_rtime = tdict[f'input/rtime.{b}'][...,0] # (b,t)
            #p_dtime = tdict[f'input/dtime.{b}'][...,0] # (b,t)
            #p_x = tdict[f'input/x.{b}'] # (b,t,f)
            #p_rerror = tdict[f'target/rerror.{b}'] # (b,t,1)
            #p_rx = tdict[f'target/recx.{b}'] # (b,t,1)

            #print(tdict.keys())
            uses_attn = any(['attn_scores' in k for k in tdict.keys()])
            if not uses_attn:
                return

            ### attn scores
            attn_scores = tdict[f'model/attn_scores/encz.{b}'] # (b,h,qt)
            attn_scores_mean = attn_scores.mean(dim=1)[...,None] # (b,h,qt) > (b,qt,1) # mean along heads; not a distribution, as it need not sum to 1
            attn_scores_min_max = seq_utils.seq_min_max_norm(attn_scores_mean, p_onehot) # (b,qt,1)

            ### stats
            lcobj_names = dataset.get_lcobj_names()
            bar = ProgressBar(len(lcobj_names))
            for k,lcobj_name in enumerate(lcobj_names):
                lcobj = dataset.lcset[lcobj_name]
                lcobjb = lcobj.get_b(b) # complete
                p_onehot_k = tensor_to_numpy(p_onehot[k]) # (b,t) > (t)
                b_len = p_onehot_k.sum()
                assert b_len<=len(lcobjb), f'{b_len}<={len(lcobjb)}'
                if b_len<=min(djs):
                    continue

                attn_scores_k = tensor_to_numpy(attn_scores_mean[k,:b_len,0]) # (b,qt,1) > (t)
                attn_scores_min_max_k = tensor_to_numpy(attn_scores_min_max[k,:b_len,0]) # (b,qt,1) > (t)
                attn_entropy_h = tensor_to_numpy(torch.sum(-attn_scores[k,:,:b_len]*torch.log(attn_scores[k,:,:b_len]+1e-10), dim=1)) # (b,h,qt) > (h)
                attn_scores_mean_distr = torch.softmax(attn_scores_mean[k,:b_len,0], dim=0) # (b,qt,1) > (qt)
                attn_entropy = tensor_to_numpy(torch.sum(-attn_scores_mean_distr*torch.log(attn_scores_mean_distr+1e-10), dim=0)) # (qt) > ()

                days = lcobjb.days[:b_len] # (t)
                obs = lcobjb.obs[:b_len] # (t)
                obse = lcobjb.obse[:b_len] # (t)
                snr = lcobjb.get_snr(max_len=b_len)
                max_obs = np.max(obs)
                peak_day = days[np.argmax(obs)]
                duration = days[-1]-days[0]
                obs_min_max = min_max_norm(obs) # (t)
                obse_min_max = min_max_norm(obse) # (t)
                bar(f'b={b}; lcobj_name={lcobj_name}; b_len={b_len}; attn_entropy_h={attn_entropy_h}; attn_entropy={attn_entropy}; snr={snr}; max_obs={max_obs}')

                lc_features = []
                for j in range(min(djs), b_len): # dj,dj+1,...,b_len-1
                    j_features = {
                        f'j':j,
                        f'attn_scores_k.j':attn_scores_k[j],
                        f'attn_scores_min_max_k.j':attn_scores_min_max_k[j],
                        f'days.j':days[j],
                        f'obs.j':obs[j],
                        f'obs_min_max.j':obs_min_max[j],
                        f'obse.j':obse[j],
                        f'obse_min_max.j':obse_min_max[j],
                        }
                    for dj in djs:
                        local_slope_m, local_slope_n, sub_days, sub_obs = get_local_slope(days, obs, j, dj)
                        j_features.update({
                            f'local_slope_m.j~dj={dj}':local_slope_m,
                            f'local_slope_n.j~dj={dj}':local_slope_n,
                            f'peak_distance.j~dj={dj}~mode=local':days[j]-peak_day,
                            f'peak_distance.j~dj={dj}~mode=mean':np.mean(sub_days)-peak_day,
                            f'peak_distance.j~dj={dj}~mode=median':np.median(sub_days)-peak_day,
                            })
                    lc_features += [j_features]

                attn_scores_collection[b] += [{
                    #'lcobj_name':lcobj_name,
                    f'c':dataset.class_names[lcobj.y],
                    f'b_len':b_len,
                    f'peak_day':peak_day,
                    f'duration':duration,
                    f'attn_entropy_h':attn_entropy_h,
                    f'attn_entropy':attn_entropy,
                    f'snr':snr,
                    f'max_obs':max_obs,
                    f'lc_features':lc_features,
                    }]

            bar.done()

    results = {
        'model_name':train_handler.model.get_name(),
        'survey':dataset.survey,
        'band_names':dataset.band_names,
        'class_names':dataset.class_names,
        'max_day':dataset.max_day,
        'attn_scores_collection':attn_scores_collection,
        }

    ### save file
    save_filedir = f'{save_rootdir}/{dataset.lcset_name}/id={train_handler.id}.d'
    files.save_pickle(save_filedir, results) # save file
    dataset.reset_max_day() # very important!!
    dataset.calcule_precomputed()
    return
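# Hedged sketch (not the actual get_local_slope implementation): given how it is used
# above (returning a slope, an intercept, and the day/obs sub-arrays it was fit on), one
# plausible version fits a least-squares line over the dj+1 observations ending at index j.
import numpy as np

def get_local_slope_sketch(days, obs, j, dj):
    sub_days = days[max(0, j-dj):j+1] # the up-to dj+1 points ending at index j
    sub_obs = obs[max(0, j-dj):j+1]
    local_slope_m, local_slope_n = np.polyfit(sub_days, sub_obs, 1) # slope and intercept of the local linear fit
    return local_slope_m, local_slope_n, sub_days, sub_obs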