import os
import pickle as pkl
from os.path import join as oj

import numpy as np
import pandas as pd
from tqdm import tqdm

# repo-local modules; log_transforms, add_sig_mean, remove_invalid_tracks,
# process_tracks_by_lifetime, and train_reg are assumed to be defined
# elsewhere in this module / package
import config
import data
import features
import load_tracking
import outcomes


def load_and_train(dset, outcome_def, out_dir, feat_names=None, use_processed=True):
    df = pd.read_pickle(f'../data/tracks/tracks_{dset}.pkl')

    # filter tracks depending on the dataset
    if dset == 'clath_aux_dynamin':
        df = df[df.catIdx.isin([1, 2])]
        df = df[df.lifetime > 15]
    else:
        df = df[df['valid'] == 1]

    # add engineered features
    df = features.add_basic_features(df)
    df = log_transforms(df)
    df = add_sig_mean(df)

    # train/test split by cell number (df_test is currently unused here)
    df_train = df[df.cell_num.isin(config.DSETS[dset]['train'])]
    df_test = df[df.cell_num.isin(config.DSETS[dset]['test'])]
    df_train = df_train.dropna()

    os.makedirs(out_dir, exist_ok=True)

    # default feature set: drop dictionary, positional-max, pc, log, and binary features
    if not feat_names:
        feat_names = data.get_feature_names(df_train)
        feat_names = [
            x for x in feat_names
            if not x.startswith('sc_')
            and not x.startswith('nmf_')
            and x not in ['center_max', 'left_max', 'right_max', 'up_max', 'down_max',
                          'X_max_around_Y_peak', 'X_max_after_Y_peak']
            and not x.startswith('pc_')
            and 'log' not in x
            and 'binary' not in x
            # and 'slope' not in x
        ]

    # fit one regression model per model type, caching each to disk
    for model_type in tqdm(['linear', 'gb', 'rf', 'svm', 'ridge']):
        out_name = f'{model_type}'
        if use_processed and os.path.exists(f'{out_dir}/{out_name}.pkl'):
            continue  # already trained and cached
        train_reg(df_train, feat_names=feat_names, model_type=model_type,
                  outcome_def=outcome_def, out_name=f'{out_dir}/{out_name}.pkl')
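# Usage sketch (hypothetical paths/names; assumes '../data/tracks/tracks_<dset>.pkl'
# exists and config.DSETS defines the dataset's train/test cell splits):
#
#   load_and_train(dset='clath_aux+gak_a7d2',
#                  outcome_def='Z_sig_mean',
#                  out_dir='results/regression/example')
#
# Each model type is fit once and cached as <out_dir>/<model_type>.pkl; with
# use_processed=True, models already on disk are skipped rather than refit.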
def get_data(dset='clath_aux+gak_a7d2', use_processed=True,
             save_processed=True,
             processed_file=oj(config.DIR_PROCESSED, 'df.pkl'),
             metadata_file=oj(config.DIR_PROCESSED, 'metadata.pkl'),
             use_processed_dicts=True,
             compute_dictionary_learning=False,
             outcome_def='y_consec_thresh',
             pixel_data: bool = False,
             video_data: bool = True,
             acc_thresh=0.95,
             previous_meta_file: str = None):
    '''
    Params
    ------
    use_processed: bool, optional
        determines whether to load df from cached pkl
    save_processed: bool, optional
        if not using processed, determines whether to save the df
    use_processed_dicts: bool, optional
        if False, recalculate the dictionary features
    previous_meta_file: str, optional
        filename for the metadata.pkl file saved by a previous preprocessing run;
        the lifetime thresholds are taken from this file
    '''
    # look up dataset-specific config and make cache filenames dataset-specific
    DSET = config.DSETS[dset]
    LABELS = config.LABELS[dset]
    processed_file = processed_file[:-4] + '_' + dset + '.pkl'
    metadata_file = metadata_file[:-4] + '_' + dset + '.pkl'

    if use_processed and os.path.exists(processed_file):
        return pd.read_pickle(processed_file)

    print('loading + preprocessing data...')
    metadata = {}

    # load tracks
    print('\tloading tracks...')
    df = load_tracking.get_tracks(
        data_dir=DSET['data_dir'], split=DSET, pixel_data=pixel_data,
        video_data=video_data, dset=dset)  # note: different Xs can be different shapes
    # df = df.fillna(df.median())  # this only does anything for the dynamin tracks, where x_pos is sometimes NaN
    df['pid'] = np.arange(df.shape[0])  # assign each track a unique id
    df['valid'] = True  # all tracks start as valid

    # mark testing tracks as not valid
    if DSET['test'] is not None:
        df.loc[df.cell_num.isin(DSET['test']), 'valid'] = False
    metadata['num_tracks'] = df.valid.sum()

    # preprocess data
    print('\tpreprocessing data...')
    df = remove_invalid_tracks(df)  # filters tracks using catIdx
    df = features.add_basic_features(df)
    df = outcomes.add_outcomes(df, LABELS=LABELS)

    metadata['num_tracks_valid'] = df.valid.sum()
    metadata['num_aux_pos_valid'] = df[df.valid][outcome_def].sum()
    metadata['num_hotspots_valid'] = df[df.valid]['hotspots'].sum()
    df.loc[df.hotspots, 'valid'] = False  # exclude hotspot tracks

    # exclude tracks at the lifetime extremes (marked short/long by thresholding)
    df, meta_lifetime = process_tracks_by_lifetime(
        df, outcome_def=outcome_def, plot=False,
        acc_thresh=acc_thresh, previous_meta_file=previous_meta_file)
    df.loc[df.short, 'valid'] = False
    df.loc[df.long, 'valid'] = False
    metadata.update(meta_lifetime)
    metadata['num_tracks_hard'] = df['valid'].sum()
    metadata['num_aux_pos_hard'] = int(df[df.valid == 1][outcome_def].sum())

    # add features
    print('\tadding features...')
    df = features.add_dasc_features(df)
    if compute_dictionary_learning:
        df = features.add_dict_features(df, use_processed=use_processed_dicts)
    # df = features.add_smoothed_tracks(df)
    # df = features.add_pcs(df)
    # df = features.add_trend_filtering(df)
    # df = features.add_binary_features(df, outcome_def=outcome_def)

    if save_processed:
        print('\tsaving...')
        with open(metadata_file, 'wb') as f:
            pkl.dump(metadata, f)
        df.to_pickle(processed_file)
    return df
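# Usage sketch (assumes config.DIR_PROCESSED exists and the raw tracking data
# for the dataset is available under config.DSETS[dset]['data_dir']):
#
#   df = get_data(dset='clath_aux+gak_a7d2', use_processed=True)
#
# The first call preprocesses the tracks and caches the result as
# config.DIR_PROCESSED/df_<dset>.pkl (plus metadata_<dset>.pkl); subsequent
# calls with use_processed=True load the cached pickle directly.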