Example #1
0
def build_meta_and_matching_feat_dfs(feat, fname, meta, align_bluelight=True):
    """Build matching features/metadata objects, optionally bluelight-aligned.

    Parameters
    ----------
    feat, fname, meta :
        Forwarded unchanged, in this order, to tierpsytools'
        ``build_matching_feat_meta`` (always called with
        ``add_bluelight=True``).
    align_bluelight : bool, optional
        If true, the matched pair is additionally passed through
        ``align_bluelight_conditions`` with ``how='outer'``.
        The default is True.

    Returns
    -------
    feat, meta
        The pair produced by the last tierpsytools call that ran.

    Notes
    -----
    Prints the shape of the resulting features object to stdout.
    """
    from tierpsytools.read_data.hydra_metadata import (
        align_bluelight_conditions,
        build_matching_feat_meta,
    )

    matched_feat, matched_meta = build_matching_feat_meta(
        feat, fname, meta, add_bluelight=True)

    if align_bluelight:
        matched_feat, matched_meta = align_bluelight_conditions(
            matched_feat, matched_meta, how='outer')

    print('1: ', matched_feat.shape)
    return matched_feat, matched_meta
Example #2
0
def read_disease_data(feat_file, fname_file, metadata_file, drop_nans=True, export_nan_worms=False):
    """Read feature summaries and metadata, aligned across bluelight conditions.

    Parameters
    ----------
    feat_file : str or pathlib.Path
        Features summary file, passed to ``read_hydra_metadata``. Its parent
        directory is handed to ``drop_nan_worms`` as ``saveto``.
    fname_file : str or pathlib.Path
        Filenames summary file, passed to ``read_hydra_metadata``.
    metadata_file : str or pathlib.Path
        Metadata file, passed to ``read_hydra_metadata``.
    drop_nans : bool, optional
        If true, pass the aligned data through ``drop_nan_worms``.
        The default is True.
    export_nan_worms : bool, optional
        Currently not used by this function; kept so existing callers keep
        working. The default is False.

    Returns
    -------
    feat : pandas.DataFrame
        Features after bluelight alignment (and ``drop_nan_worms`` if
        requested).
    meta : pandas.DataFrame
        Matching metadata, with ``imaging_date_yyyymmdd`` converted to
        ``datetime.date`` values.

    Raises
    ------
    AssertionError
        If the number of unique ``worm_strain`` values differs from the
        number of unique ``worm_gene`` values.
    """
    from pathlib import Path

    feat, meta = read_hydra_metadata(feat_file, fname_file, metadata_file)

    # Dates are stored as yyyymmdd; convert them to datetime.date objects.
    meta['imaging_date_yyyymmdd'] = pd.to_datetime(
        meta.imaging_date_yyyymmdd, format='%Y%m%d').dt.date

    # Sanity check: as many distinct strains as distinct genes.
    n_strains = meta.worm_strain.unique().shape[0]
    n_genes = meta.worm_gene.unique().shape[0]
    assert n_strains == n_genes, (
        'worm_strain and worm_gene have a different number of unique values '
        '({} vs {})'.format(n_strains, n_genes))

    # how='inner' removes wells that don't have all 3 conditions
    feat, meta = align_bluelight_conditions(feat, meta, how='inner')

    if drop_nans:
        # Path() so that a plain string path works as well as a Path object.
        feat, meta = drop_nan_worms(
            feat, meta, saveto=Path(feat_file).parent)

    return feat, meta
Example #3
0
    fname_files.sort(key=find_window)

    assert (find_window(f[0]) == find_window(f[1]) for f in list(zip(feat_files, fname_files)))

    feat_df = []
    meta_df = []
    for c,f in enumerate(list(zip(feat_files, fname_files))):
        _feat = pd.read_csv(f[0],
                            comment='#')
        _fname = pd.read_csv(f[1],
                             comment='#')
    
        _feat, _meta = read_hydra_metadata(_feat, _fname, meta)
    
        _feat, _meta = align_bluelight_conditions(_feat,
                                                  _meta,
                                                  how='inner')
        _meta['window'] = find_window(f[0])
        meta_df.append(_meta)
        feat_df.append(_feat)
    
    assert pd.concat(meta_df).shape[0] == pd.concat(feat_df).shape[0]

    meta = pd.concat(meta_df)
    meta.reset_index(drop=True, inplace=True)
    feat = pd.concat(feat_df)
    feat.reset_index(drop=True, inplace=True)
    
    #%% wells to check
    nan_worms = meta[meta.worm_gene.isna()][['featuresN_filename',
                                             'well_name',
    # add well annotations to metadata
    annotated_metadata_path = Path(
        str(metadata_path).replace('.csv', '_annotated.csv'))
    if not annotated_metadata_path.exists():
        metadata_df = update_metadata_with_wells_annotations(
            Path(args.aux_dir), saveto=annotated_metadata_path)

    # read metadata + features summaries
    features_df, metadata_df = read_hydra_metadata(
        feat_file=args.features_file,
        fname_file=args.filenames_file,
        meta_file=annotated_metadata_path)

    # align bluelight conditions (as separate feature columns)
    features_df, metadata_df = align_bluelight_conditions(
        features_df,
        metadata_df,
        merge_on_cols=['date_yyyymmdd', 'imaging_plate_id', 'well_name'])

    ### clean data

    # remove rows with missing strain information (n=10)
    metadata_df = metadata_df[~metadata_df[args.strain_colname].isna()]
    features_df = features_df.reindex(metadata_df.index)

    # subset for Tierpsy features only
    if args.n_features is not None:
        features_df = select_feat_set(features_df,
                                      tierpsy_set_name='tierpsy_{}'.format(
                                          args.n_features),
                                      append_bluelight=True)
# Script excerpt: load feature summaries + metadata, match them, align the
# bluelight conditions and keep only N2 wells with every condition present.
# NOTE(review): feat_file, fname_file and metadata_file are not defined in
# this excerpt; they are expected to be set earlier in the script.

# Hard-coded, machine-specific location of the mode-of-action table.
moa_file = '/Users/em812/Data/Drugs/StrainScreens/AllCompoundsMoA.csv'

# Metadata columns used to flag bad wells (applied after this excerpt).
bad_well_cols = [
    'is_bad_well_from_gui', 'is_bad_well_misplaced_plate',
    'is_bad_well_ledfailure'
]
#%% Read data
# Lines starting with '#' in the summary files are skipped as comments.
feat = pd.read_csv(feat_file, comment='#')
fname = pd.read_csv(fname_file, comment='#')

meta = pd.read_csv(metadata_file, index_col=None)
# Rows without a drug_type are labelled 'NoCompound'.
meta.loc[meta['drug_type'].isna(), 'drug_type'] = 'NoCompound'

# Match metadata to feature summaries
feat, meta = read_hydra_metadata(feat, fname, meta)
feat, meta = align_bluelight_conditions(feat, meta, how='outer')

# Only the metadata is used from here on; drop the features dataframe.
del feat

meta_colnames = list(meta.columns)
print(meta_colnames)
#%% Choose the videos
# Keep only N2s
meta = meta[meta['worm_strain'] == 'N2']

# Remove wells missing bluelight conditions
# (any well with a NaN in one of the 'imgstore_name' columns is dropped)
imgstore_cols = [col for col in meta.columns if 'imgstore_name' in col]
miss = meta[imgstore_cols].isna().any(axis=1)
meta = meta.loc[~miss, :]

# Remove bad wells
Example #6
0
def _matched_summary_pairs(feat_files):
    """Pair each features summary with its existing filenames summary.

    The companion file is looked up next to the features file, under the same
    name with the first 'features_' replaced by 'filenames_'. Features files
    without a companion on disk are dropped (with a printed warning), as are
    pairs with 'window' anywhere in either path. The result is de-duplicated
    and sorted by features path.
    """
    pairs = []
    for ft in sorted({Path(f) for f in feat_files}):
        fn = ft.with_name(ft.name.replace('features_', 'filenames_', 1))
        if 'window' in str(ft) or 'window' in str(fn):
            continue
        if not fn.is_file():
            print("WARNING: no filenames summary found for '%s', skipping" % ft)
            continue
        pairs.append((ft, fn))
    return pairs


def process_feature_summaries(metadata_path,
                              results_dir,
                              compile_day_summaries=True,
                              imaging_dates=None,
                              align_bluelight=True,
                              window_summaries=False,
                              n_wells=96):
    """Compile feature summary results and join them with the metadata.

    If the combined summaries ('full_features.csv' / 'full_filenames.csv', or
    their 'full_window_*' counterparts) are not both present in
    ``results_dir``, they are compiled first. The combined summaries are then
    read together with the metadata and, optionally, aligned across bluelight
    conditions.

    Parameters
    ----------
    metadata_path : str, Path
        Path to the experiment metadata CSV file
    results_dir : str, Path
        Path to 'Results' directory, containing Tierpsy feature summaries files
    compile_day_summaries : bool
        Compile from Tierpsy feature summaries for each experiment day
        (searched recursively). If False, only summaries directly inside
        ``results_dir`` are used.
    imaging_dates : list of str, None
        Imaging dates (sub-directories of ``results_dir``) to compile Tierpsy
        feature summaries from. If None, ``results_dir`` is searched as a
        whole.
    align_bluelight : bool
        Align bluelight conditions (convert to wide format)
    window_summaries : bool
        Compile from windowed features summaries files
    n_wells : int
        Forwarded to ``compile_window_summaries`` (window summaries only)

    Returns
    -------
    features, metadata
        Metadata columns keep their order from the metadata file; columns
        added while matching/aligning are appended after them, in the order
        they appear in the matched metadata.

    Raises
    ------
    AssertionError
        If ``imaging_dates`` is given but is not a list, if features and
        metadata end up with different indexes, or if an original metadata
        column went missing.
    """

    from tierpsytools.read_data.compile_features_summaries import compile_tierpsy_summaries
    from tierpsytools.read_data.hydra_metadata import read_hydra_metadata, align_bluelight_conditions
    from preprocessing.compile_window_summaries import find_window_summaries, compile_window_summaries

    results_path = Path(results_dir)
    if window_summaries:
        combined_feats_path = results_path / "full_window_features.csv"
        combined_fnames_path = results_path / "full_window_filenames.csv"
    else:
        combined_feats_path = results_path / "full_features.csv"
        combined_fnames_path = results_path / "full_filenames.csv"

    if combined_feats_path.is_file() and combined_fnames_path.is_file():
        print("Found existing full feature summaries")
    else:
        print("Compiling feature summary results")
        if window_summaries:
            print("\nFinding window summaries files..")
            fname_files, feat_files = find_window_summaries(
                results_dir=results_dir, dates=imaging_dates)

            # compile window summaries files
            print("\nCompiling window summaries..")
            compile_window_summaries(
                fname_files=fname_files,
                feat_files=feat_files,
                compiled_fnames_path=combined_fnames_path,
                compiled_feats_path=combined_feats_path,
                results_dir=results_path,
                window_list=None,
                n_wells=n_wells)
        else:
            if not compile_day_summaries:
                # only summaries sitting directly in the results directory
                feat_files = list(results_path.glob('features_summary*.csv'))
            elif imaging_dates is None:
                feat_files = list(results_path.rglob('features_summary*.csv'))
            else:
                assert isinstance(imaging_dates, list), \
                    "imaging_dates must be a list of date strings"
                feat_files = []
                for date in imaging_dates:
                    feat_files.extend(
                        (results_path / date).rglob('features_summary*.csv'))

            # Keep only features files for which a matching filenames summary
            # exists, paired explicitly so both lists stay in the same order.
            pairs = _matched_summary_pairs(feat_files)
            feat_files = [ft for ft, _ in pairs]
            fname_files = [fn for _, fn in pairs]

            # Compile feature summaries for matched features/filename summaries
            compile_tierpsy_summaries(feat_files=feat_files,
                                      fname_files=fname_files,
                                      compiled_feat_file=combined_feats_path,
                                      compiled_fname_file=combined_fnames_path)

    # Read metadata + record column order
    metadata = pd.read_csv(metadata_path,
                           dtype={
                               "comments": str,
                               "source_plate_id": str,
                               "imaging_run_number": str
                           })
    meta_col_order = metadata.columns.tolist()

    feat_id_cols = ['file_id', 'n_skeletons', 'well_name']
    # only carry 'is_good_well' when the metadata has well annotations
    if 'is_good_well' in meta_col_order:
        feat_id_cols.append('is_good_well')
    if window_summaries:
        feat_id_cols.append('window')

    # Read features summaries + metadata and add bluelight column if aligning bluelight video results
    features, metadata = read_hydra_metadata(combined_feats_path,
                                             combined_fnames_path,
                                             metadata_path,
                                             feat_id_cols=feat_id_cols,
                                             add_bluelight=align_bluelight)

    if align_bluelight:
        features, metadata = align_bluelight_conditions(
            feat=features,
            meta=metadata,
            how='outer',
            merge_on_cols=[
                'date_yyyymmdd', 'imaging_run_number', 'imaging_plate_id',
                'well_name'
            ])
        # NOTE(review): presumably alignment replaces 'imgstore_name' with
        # per-condition columns, so it is no longer expected below - confirm.
        meta_col_order.remove('imgstore_name')

    assert set(features.index) == set(metadata.index)

    # ensure no old columns were dropped
    dropped_cols = set(meta_col_order) - set(metadata.columns)
    assert len(dropped_cols) == 0, \
        "metadata columns lost: {}".format(sorted(dropped_cols))

    # record new columns, in dataframe order so the output is deterministic
    known_cols = set(meta_col_order)
    meta_col_order.extend(
        [col for col in metadata.columns if col not in known_cols])

    return features, metadata[meta_col_order]
    #check files in summary vs files in metadata
    # assert len(set(fname.filename.unique()) - set(meta.imgstore_name.unique())) == 0

    imgstore_fname = [
        '/'.join(r.filename.split('/')[-3:-1]) for i, r in fname.iterrows()
    ]
    missing_files = set(imgstore_fname) - set(meta.imgstore_name.unique())

    # missing files are from 20200626

    #%%
    feat, meta = read_hydra_metadata(feat, fname, meta)

    feat, meta = align_bluelight_conditions(
        feat, meta,
        how='inner')  #removes wells that don't have all 3 conditions

    #%% wells to check
    nan_worms = meta[meta.worm_gene.isna()][[
        'featuresN_filename', 'well_name', 'imaging_plate_id',
        'instrument_name', 'imaging_date_yyyymmdd'
    ]]
    # nan_worms.to_csv(FEAT_FILE.parent / 'nan_worms.csv',
    #                   index=False)

    feat = feat.drop(index=nan_worms.index)
    meta = meta.drop(index=nan_worms.index)

    #%%
    # strain sets
Example #8
0
# Script excerpt: load feature summaries + metadata, match them, align the
# bluelight conditions, attach mode-of-action info and drop bad wells.
# NOTE(review): root, feat_file, fname_file and metadata_file are not defined
# in this excerpt; they are expected to be set earlier in the script.
moa_file = root / 'analysis' / 'AllCompoundsMoA.csv'

# Metadata columns that flag a well as bad; a well is dropped if any is true.
bad_well_cols = [
    'is_bad_well_from_gui', 'is_bad_well_misplaced_plate',
    'is_bad_well_ledfailure'
]

#%% Read data
# Lines starting with '#' in the summary files are skipped as comments.
feat = pd.read_csv(feat_file, comment='#')
fname = pd.read_csv(fname_file, comment='#')

meta = pd.read_csv(metadata_file, index_col=None)

# %% Match metadata and features
feat, meta = read_hydra_metadata(feat, fname, meta)
feat, meta = align_bluelight_conditions(feat, meta)

# %% Add moa info
moa = pd.read_csv(moa_file, index_col=None)
# The MoA table names the compound 'CSN'; rename it to the metadata's key.
moa = moa.rename(columns={"CSN": "drug_type"})
# Left merge keeps every metadata row, including drugs absent from the table.
meta = pd.merge(meta,
                moa[['MOA_general', 'MOA_specific', 'drug_type', 'MOA_group']],
                on='drug_type',
                how='left')

# %% Preprocess (optional)
# Remove bad wells
bad = meta[bad_well_cols].any(axis=1)

# NOTE(review): the mask carries meta's post-merge index; this assumes feat
# has the same index and row count (i.e. unique drug_type in moa) - confirm.
feat = feat.loc[~bad, :]
meta = meta.loc[~bad, :]
Example #9
0
# Script excerpt: read the sample features/metadata, align the bluelight
# conditions and filter rows by skeleton count and NaN content.
# NOTE(review): feat_file is not defined in this excerpt; it is expected to
# be set earlier in the script.
meta_file = Path().cwd() / 'sample_data' / 'metadata_dataframe.csv'

#%% Read data
# If the input files are the features_summaries, the filenames_summaries and
# metadata files, then you can use this function to make your matching feat
# and meta dataframes:
# feat, meta = read_hydra_metadata(feat_file, fname_file, meta_file)

# For the synthetic data, I just need to read the dataframes:
feat = pd.read_csv(feat_file)
meta = pd.read_csv(meta_file)

# Align the bluelight conditions (one row per well, wide format)
feat, meta = align_bluelight_conditions(
    feat,
    meta,
    bluelight_specific_meta_cols=['imgstore_name', 'n_skeletons'],
    merge_on_cols=['date_yyyymmdd', 'imaging_plate_id', 'well_name'])

#%% Filter data
# Filter rows based on n_skeletons
feat, meta = filter_n_skeletons(feat,
                                meta,
                                min_nskel_per_video=2000,
                                min_nskel_sum=None)

# Filter rows based on percentage of nan values
# NOTE(review): presumably 0.2 is the allowed NaN fraction and 1 selects
# row-wise filtering - confirm against filter_nan_inf's signature.
feat = filter_nan_inf(feat, 0.2, 1)
# Keep the metadata in step with the rows that survived the filter.
meta = meta.loc[feat.index]

# Filter features based on percentage of nan values