import os

import pandas as pd
from tqdm import tqdm

# repo-local modules, assumed importable from this project;
# log_transforms, add_sig_mean, and train_reg are helpers defined
# elsewhere in the same module
import config
import data
import features


def load_and_train(dset,
                   outcome_def,
                   out_dir,
                   feat_names=None,
                   use_processed=True):
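    '''Fit and cache several regression models for one dataset.

    Params
    ------
    dset: str
        key into config.DSETS selecting which tracks pickle to load
    outcome_def: str
        name of the column used as the regression target
    out_dir: str
        directory where the fitted models are pickled
    feat_names: list, optional
        feature columns to use; inferred from the training df if None
    use_processed: bool, optional
        if True, skip any model whose pickle already exists in out_dir
    '''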

    df = pd.read_pickle(f'../data/tracks/tracks_{dset}.pkl')
    if dset == 'clath_aux_dynamin':
        df = df[df.catIdx.isin([1, 2])]
        df = df[df.lifetime > 15]
    else:
        df = df[df['valid'] == 1]
    df = features.add_basic_features(df)
    df = log_transforms(df)
    df = add_sig_mean(df)
    df_train = df[df.cell_num.isin(config.DSETS[dset]['train'])]
    df_test = df[df.cell_num.isin(config.DSETS[dset]['test'])]
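    # drop tracks with missing feature values before fitting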
    df_train = df_train.dropna()

    #outcome_def = 'Z_sig_mean'
    #out_dir = 'results/regression/Sep15'
    os.makedirs(out_dir, exist_ok=True)
    if not feat_names:
        feat_names = data.get_feature_names(df_train)
        feat_names = [
            x for x in feat_names
            if not x.startswith(('sc_', 'nmf_', 'pc_'))
            and x not in [
                'center_max', 'left_max', 'right_max', 'up_max', 'down_max',
                'X_max_around_Y_peak', 'X_max_after_Y_peak'
            ]
            and 'log' not in x
            and 'binary' not in x
            # and 'slope' not in x
        ]
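    # fit one regressor of each type and cache it under out_dir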
    for model_type in tqdm(['linear', 'gb', 'rf', 'svm', 'ridge']):
        out_name = model_type
        if use_processed and os.path.exists(f'{out_dir}/{out_name}.pkl'):
            continue
        train_reg(df_train,
                  feat_names=feat_names,
                  model_type=model_type,
                  outcome_def=outcome_def,
                  out_name=f'{out_dir}/{out_name}.pkl')
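
# Usage sketch (values follow the commented-out hints above; whether
# 'Z_sig_mean' exists as a column depends on what add_sig_mean computes):
#
#   load_and_train(dset='clath_aux_dynamin',
#                  outcome_def='Z_sig_mean',
#                  out_dir='results/regression/Sep15')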
Example #2

import os
import pickle as pkl
from os.path import join as oj  # the repo's 'oj' alias for os.path.join

import numpy as np
import pandas as pd

# repo-local modules, assumed importable from this project;
# remove_invalid_tracks and process_tracks_by_lifetime are helpers
# defined elsewhere in the same module
import config
import features
import load_tracking
import outcomes

def get_data(dset='clath_aux+gak_a7d2',
             use_processed=True,
             save_processed=True,
             processed_file=oj(config.DIR_PROCESSED, 'df.pkl'),
             metadata_file=oj(config.DIR_PROCESSED, 'metadata.pkl'),
             use_processed_dicts=True,
             compute_dictionary_learning=False,
             outcome_def='y_consec_thresh',
             pixel_data: bool = False,
             video_data: bool = True,
             acc_thresh=0.95,
             previous_meta_file: str = None):
    '''Load tracks for one dataset, preprocess them, and add features.

    Params
    ------
    dset: str, optional
        key into config.DSETS / config.LABELS selecting the dataset
    use_processed: bool, optional
        determines whether to load df from cached pkl
    save_processed: bool, optional
        if not using processed, determines whether to save the df
    use_processed_dicts: bool, optional
        if False, recalculate the dictionary features
    compute_dictionary_learning: bool, optional
        if True, add dictionary-learning features
    outcome_def: str, optional
        name of the outcome column used for the metadata counts
    acc_thresh: float, optional
        accuracy threshold passed to process_tracks_by_lifetime
    previous_meta_file: str, optional
        filename for metadata.pkl file saved by previous preprocessing;
        the thresholds for lifetime are taken from this file
    '''
    # get things based on dset
    DSET = config.DSETS[dset]
    LABELS = config.LABELS[dset]

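    # suffix cache filenames with the dataset name so different dsets don't collide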
    processed_file = processed_file[:-4] + '_' + dset + '.pkl'
    metadata_file = metadata_file[:-4] + '_' + dset + '.pkl'

    if use_processed and os.path.exists(processed_file):
        return pd.read_pickle(processed_file)
    else:
        print('loading + preprocessing data...')
        metadata = {}

        # load tracks
        print('\tloading tracks...')
        df = load_tracking.get_tracks(
            data_dir=DSET['data_dir'],
            split=DSET,
            pixel_data=pixel_data,
            video_data=video_data,
            dset=dset)  # note: different Xs can be different shapes
        #         df = df.fillna(df.median()) # this only does anything for the dynamin tracks, where x_pos is sometimes NaN
        #         print('num nans', df.isna().sum())
        df['pid'] = np.arange(df.shape[0])  # assign each track a unique id
        df['valid'] = True  # all tracks start as valid

        # set testing tracks to not valid
        if DSET['test'] is not None:
            df.loc[df.cell_num.isin(DSET['test']), 'valid'] = False
        metadata['num_tracks'] = df.valid.sum()
        # print('training', df.valid.sum())

        # preprocess data
        print('\tpreprocessing data...')
        df = remove_invalid_tracks(df)  # use catIdx
        # print('valid', df.valid.sum())
        df = features.add_basic_features(df)
        df = outcomes.add_outcomes(df, LABELS=LABELS)

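        # record counts before hotspot and lifetime filtering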
        metadata['num_tracks_valid'] = df.valid.sum()
        metadata['num_aux_pos_valid'] = df[df.valid][outcome_def].sum()
        metadata['num_hotspots_valid'] = df[df.valid]['hotspots'].sum()
        df.loc[df.hotspots, 'valid'] = False  # exclude hotspot tracks
        df, meta_lifetime = process_tracks_by_lifetime(
            df,
            outcome_def=outcome_def,
            plot=False,
            acc_thresh=acc_thresh,
            previous_meta_file=previous_meta_file)
        df.loc[df.short, 'valid'] = False  # exclude tracks below the lifetime threshold
        df.loc[df.long, 'valid'] = False  # exclude tracks above the lifetime threshold
        metadata.update(meta_lifetime)
        metadata['num_tracks_hard'] = df['valid'].sum()
        metadata['num_aux_pos_hard'] = int(df[df.valid][outcome_def].sum())

        # add features
        print('\tadding features...')
        df = features.add_dasc_features(df)
        if compute_dictionary_learning:
            df = features.add_dict_features(df,
                                            use_processed=use_processed_dicts)
        # df = features.add_smoothed_tracks(df)
        # df = features.add_pcs(df)
        # df = features.add_trend_filtering(df)
        # df = features.add_binary_features(df, outcome_def=outcome_def)
        if save_processed:
            print('\tsaving...')
            with open(metadata_file, 'wb') as f:
                pkl.dump(metadata, f)
            df.to_pickle(processed_file)
    return df
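
# Usage sketch (assumes the repo's config paths exist; 'clath_aux+gak_a7d2'
# is the function's default dataset key):
#
#   df = get_data(dset='clath_aux+gak_a7d2', use_processed=True)
#   print(df.valid.sum(), 'valid tracks')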