# Shared imports for the distribution helpers below.
import subprocess
from pathlib import Path

import pandas as pd
from quilt3distribute import Dataset
from quilt3distribute.validation import validate


def distribute_struct_scores_bonus(
    test=False,
    csv_loc="/allen/aics/gene-editing/FISH/2019/chaos/data/20200911_classifier_features_bonus/manifest_20201007_tg.csv",
    dataset_name="struct_scores_bonus",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):
    # read in original csv
    df = pd.read_csv(csv_loc)

    # subsample df for e.g. a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="/allen/aics/gene-editing/FISH/2019/chaos/data/20200911_classifier_features_bonus/README.md",
    )

    # set data path cols, metadata cols, and extra files
    # ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["result_image_path"])

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
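# Every distribution function in this section repeats the same commit-hash
# lookup before pushing; a small hypothetical helper that could factor it
# out (not part of the original scripts):
def git_head_hash():
    """Return the HEAD commit hash of the current working repo."""
    return (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )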
import logging
import sys
import traceback

log = logging.getLogger(__name__)


def main():
    try:
        args = Args()

        # Create dataset
        ds = Dataset(
            dataset=args.dataset_path,
            name=args.dataset_name,
            package_owner=args.package_owner,
            readme_path=args.readme_path,
        )

        # Handle optional arguments if provided
        if args.usage_doc_or_link:
            ds.add_usage_doc(args.usage_doc_or_link)
        if args.license_doc_or_link:
            ds.add_license(args.license_doc_or_link)
        if args.metadata_columns:
            ds.set_metadata_columns(args.metadata_columns)
        if args.path_columns:
            ds.set_path_columns(args.path_columns)

        # Distribute
        pkg = ds.distribute(push_uri=args.push_uri, message=args.message)

        log.info(
            f"Completed distribution. "
            f"Package [name: '{args.package_owner}/{args.dataset_name}', version: {pkg.top_hash}]"
        )

    except Exception as e:
        log.error("=============================================")
        if args.debug:
            log.error("\n\n" + traceback.format_exc())
            log.error("=============================================")
        log.error("\n\n" + str(e) + "\n")
        log.error("=============================================")
        sys.exit(1)
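# `main()` assumes an `Args` object exposing the parsed CLI options. A minimal
# argparse-based sketch, with option names inferred from the attributes used
# above (the real class may differ):
import argparse


class Args(argparse.Namespace):
    def __init__(self):
        super().__init__()
        p = argparse.ArgumentParser(
            description="Distribute a dataset as a Quilt package"
        )
        p.add_argument("dataset_path")
        p.add_argument("dataset_name")
        p.add_argument("package_owner")
        p.add_argument("readme_path")
        p.add_argument("push_uri")
        p.add_argument("--message", default=None)
        p.add_argument("--usage-doc-or-link", dest="usage_doc_or_link", default=None)
        p.add_argument("--license-doc-or-link", dest="license_doc_or_link", default=None)
        p.add_argument("--metadata-columns", dest="metadata_columns", nargs="*", default=None)
        p.add_argument("--path-columns", dest="path_columns", nargs="*", default=None)
        p.add_argument("--debug", action="store_true")
        p.parse_args(namespace=self)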
def extra_additions_dataset(example_frame, example_readme):
    ds = Dataset(example_frame, "test_dataset", "me", example_readme)
    ds.set_path_columns(["2dReadPath"])
    ds.set_extra_files([example_readme])
    ds.set_column_names_map({"2dReadPath": "MappedPath"})
    ds.set_metadata_columns(["Structure"])
    return ds
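# A hedged smoke test for the builder above, pytest-style: `tmp_path` is
# pytest's built-in temporary-directory fixture, and the toy frame uses the
# column names the builder expects.
def test_extra_additions_dataset(tmp_path):
    readme = tmp_path / "README.md"
    readme.write_text("# Test dataset\n")
    frame = pd.DataFrame(
        {
            "2dReadPath": [str(readme)],  # any existing file works for a smoke test
            "Structure": ["ACTN2"],
        }
    )
    ds = extra_additions_dataset(frame, str(readme))
    assert ds is not None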
def distribute_seg_dataset(
    test=False,
    csv_loc="../input_segs_and_tiffs/raw_seg_013_014_images.csv",
    col_name_map={
        "fov_path": "original_fov_location",
        "FOVId": "fov_id",
        "seg_file_name": "2D_fov_tiff_path",
    },
    dataset_name="2d_segmented_fields",
    package_owner="rorydm",
    s3_bucket="s3://allencell-internal-quilt",
):
    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols
    df = df.rename(col_name_map, axis="columns")

    # drop any rows with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for e.g. a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="README.md",
    )

    # structure scores as auxiliary file
    score_files = [
        Path(f"../structure_scores/structure_score_55000000{p}.csv") for p in (13, 14)
    ]
    score_dfs = [
        pd.read_csv(f).rename({"mh Score": "mh score"}, axis="columns")
        for f in score_files
    ]
    df_score = pd.concat(score_dfs, axis="rows", ignore_index=True, sort=False)
    df_score.to_csv(Path("../structure_scores/structure_scores.csv"))

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["2D_fov_tiff_path"])
    ds.set_extra_files(
        ["../channel_defs.json", "../structure_scores/structure_scores.csv"]
    )

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
def distribute_scrnaseq_data(
    test=False,
    csv_loc="scrnaseq_data_raw.csv",
    dataset_name="scrnaseq_data",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):
    df = pd.read_csv(csv_loc)

    # subsample features to make test
    if test:
        # write test matrix
        make_test_mtx(csv_loc=csv_loc)

        # make test manifest; counts only; no anndata
        df = pd.DataFrame(
            {
                "counts": [
                    "raw_counts_test.mtx",
                    df["counts"][1],
                    "cells_test.csv",
                    "cells_test.csv",
                ]
            }
        )
        dataset_name = f"{dataset_name}_test"

        # create the dataset without supplementary files
        ds = Dataset(
            dataset=df,
            name=dataset_name,
            package_owner=package_owner,
            readme_path="README.md",
        )

        # columns with files to upload
        ds.set_path_columns(["counts"])
    else:
        ds = Dataset(
            dataset=df,
            name=dataset_name,
            package_owner=package_owner,
            readme_path="README.md",
        )

        # columns with files to upload
        ds.set_path_columns(["counts", "anndata"])

        # anndata object (h5ad) as supplementary files
        ds.set_extra_files(
            [
                "/allen/aics/gene-editing/RNA_seq/scRNAseq_SeeligCollaboration/2019_analysis/merged_experiment_1_2/scrnaseq_cardio_20191210.RData"
            ]
        )

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
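# `make_test_mtx` is not shown in this section; a hedged sketch of what it
# might do, assuming the manifest's "counts" column points at a MatrixMarket
# file (scipy.io reads and writes the .mtx format):
import scipy.io


def make_test_mtx(csv_loc):
    manifest = pd.read_csv(csv_loc)
    counts = scipy.io.mmread(manifest["counts"][0]).tocsc()
    # keep only the first two cells (columns) as a tiny test fixture;
    # the real helper presumably also writes "cells_test.csv"
    scipy.io.mmwrite("raw_counts_test.mtx", counts[:, :2])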
def distribute_struct_scores_actn2_live(
    test=False,
    csv_loc="/allen/aics/gene-editing/FISH/2019/chaos/data/20201012_actn2_live_classifier_with_metadata/live_manifest.csv",
    dataset_name="struct_scores_actn2_live",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):
    # read in original csv
    df = pd.read_csv(csv_loc)

    # point single cell paths at their post-processing location
    df["CellPath_x"] = df["CellPath_x"].str.replace(
        "singlecells",
        "/allen/aics/assay-dev/computational/data/cardio_pipeline_datastep/local_staging_pipeline_actn2/singlecells/singlecells",
        regex=False,
    )

    # drop intermediate path columns that are not distributed
    df = df.drop(
        columns=[
            "BackgroundPath",
            "ClassificationPath",
            "MemMaxProjectionPath",
            "MemSegmentationPath",
            "NucMaxProjectionPath",
            "StrMaxIntensitySlicePath",
            "CellPath_y",
            "path",
            "image_name",
            "cell_id_filename",
        ]
    )

    # subsample df for e.g. a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="/allen/aics/gene-editing/FISH/2019/chaos/data/20200929_classifier_features_actn2/README_actn2_live.md",
    )

    # set data path cols, metadata cols, and extra files
    # ds.set_metadata_columns(["RawFilePath", "BackgroundPath", "ClassificationPath", "MemMaxProjectionPath", "MemSegmentationPath", "NucMaxProjectionPath", "StrMaxIntensitySlicePath"])
    ds.set_path_columns(["CellPath_x"])

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
def distribute_autocontrasted_dataset(
    test=False,
    csv_loc="/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/image_manifest_final.csv",
    col_name_map={},
    dataset_name="2d_autocontrasted_fields_and_single_cells_actn2",
    package_owner="rorydm",
    s3_bucket="s3://allencell-internal-quilt",
):
    # read in original csv
    df = pd.read_csv(csv_loc)

    # drop the unscaled fov tiffs and rename some cols
    df = df.drop(["2D_fov_tiff_path"], axis="columns").rename(
        col_name_map, axis="columns"
    )

    # drop any rows with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for e.g. a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/README.md",
    )

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(
        ["rescaled_2D_fov_tiff_path", "rescaled_2D_single_cell_tiff_path"]
    )
    ds.set_extra_files(
        [
            "/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/channel_defs.json",
            "/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/parameters.json",
        ]
    )

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
def distribute_struct_scores_actn2(
    test=False,
    csv_loc="/allen/aics/assay-dev/MicroscopyOtherData/Viana/projects/fish_morphology_code/fish_morphology_code/processing/structure_organization/results_Fish/AssayDevFishAnalsysis-Handoff-transcript2protein.csv",
    dataset_name="struct_scores_actn2_2",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):
    # read in original csv
    df = pd.read_csv(csv_loc)

    # only include new actn2 fish in this package -> 5500000322/323 imaged 2020-10
    date = df["original_fov_location"].str.split("/", expand=True)
    df["date"] = date[7]
    df = df[df.date.isin(["20201002", "20201006"])]
    df = df.drop(columns=["date"])

    # update result image dir (moved after processing)
    img_dir = "/allen/aics/assay-dev/MicroscopyOtherData/Viana/projects/fish_morphology_code/fish_morphology_code/processing/structure_organization/output_Fish/"
    new_result_path = [
        img_dir + Path(x).name for x in df["result_image_path"].tolist()
    ]
    df["result_image_path"] = new_result_path

    # subsample df for e.g. a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="/allen/aics/gene-editing/FISH/2019/chaos/data/20200929_classifier_features_actn2/README.md",
    )

    # set data path cols, metadata cols, and extra files
    # ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["result_image_path"])

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
def distribute_nonstructure_dataset(
    test=False,
    csv_loc="nonstructure_fov_manifest_for_quilt.csv",
    col_name_map={
        "FOVId": "fov_id",
        "fov_path": "original_fov_location",
    },
    dataset_name="2d_nonstructure_fields",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):
    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols
    df = df.rename(col_name_map, axis="columns")

    # drop any rows with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for e.g. a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="README.md",
    )

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["merged_2D_fov_tiff_path"])
    ds.set_extra_files(["channel_defs.json"])

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
def distribute_nuclear_masks(
    test=False,
    csv_loc=Path(
        "/allen/aics/microscopy/Calysta/test/fish_struc_seg/sarc_classification_for_Rory.csv"
    ),
    dataset_name="2d_nuclear_masks",
    package_owner="calystay",
    s3_bucket="s3://allencell-internal-quilt",
    readme_path="README.md",
):
    # read in original csv
    df_in = pd.read_csv(csv_loc)

    # extract original_fov_location and nuc_mask_path from dataframe
    df = df_in[["original_fov_location", "nuc_mask_path"]]
    df = df.drop_duplicates()

    # drop any rows with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for e.g. a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path=readme_path,
    )

    # set data path cols and metadata cols; both must name columns that exist
    # in df (only original_fov_location and nuc_mask_path survive the subset above)
    ds.set_metadata_columns(["original_fov_location"])
    ds.set_path_columns(["nuc_mask_path"])

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
"### Global structure organization and local structural alignment features\n\n" ) for meta in metadata: for key, value in meta.items(): ftxt.write("- `{0}`: {1}\n".format( value["name"] if value["name"] is not None else key, value["description"], )) # Checking expected shape of the dataframe assert df.shape == (5161, 25) # Save a hand off version for the Modeling team df.to_csv("../results/AssayDevFishAnalsysis-Handoff.csv") # Upload to Quilt ds = Dataset( dataset="../results/AssayDevFishAnalsysis-Handoff.csv", name="assay_dev_fish_analysis", package_owner="matheus", readme_path="assay-dev-fish.md", ) # Set metadata and path columns ds.set_metadata_columns(["CellId"]) ds.set_path_columns(["result_image_path"]) # Send to Quilt pkg = ds.distribute(push_uri="s3://allencell-internal-quilt", message="Fish dataset by assay-dev")
"tanyasg/2d_autocontrasted_single_cell_features", "s3://allencell-internal-quilt", ) df_feat_inds = p_feats["features"]["a749d0e2_cp_features.csv"]()[["fov_path"]].rename(columns={"fov_path":"original_fov_location"}) df_feat_inds = df_feat_inds.drop_duplicates() for index, row in df_feat_inds.iterrows(): df_feat_inds.loc[index, 'original_fov_name'] = row['original_fov_location'].split('/')[-1] for index, row in df.iterrows(): df.loc[index, 'original_fov_location'] = df_feat_inds.loc[df_feat_inds['file_name'] == row['original_fov_name'], 'original_fov_location'].values.tolist()[0] # merge df df_new = df.merge(df_feat_inds, how='inner', on=['original_fov_name']) df_new = df_new.set_index('index') # Upload to quilt test_df = df_new[0:2] ds = Dataset( dataset=df_new, name='3d_actn2_segmentation', package_owner='calystay', readme_path=r'C:\Users\calystay\Desktop\README.md', ) ds.set_metadata_columns(["original_fov_location"]) ds.set_path_columns(["struc_seg_path"]) ds.distribute( "s3://allencell-internal-quilt", message="3D actn2 segmentation with original_fov_location" )
import pandas as pd
from quilt3distribute import Dataset

# Read dataset
df = pd.read_csv(
    "../../fish_morphology_code/processing/structure_organization/results/AssayDevFishAnalsysis-Handoff.csv"
)

# Define package
ds = Dataset(
    dataset=df,
    name="assay_dev_fish_analysis",
    package_owner="matheus",
    readme_path="../../fish_morphology_code/processing/structure_organization/tools/assay-dev-fish.md",
)

# Metadata
ds.set_metadata_columns(["CellId"])
ds.set_path_columns(["result_image_path"])

# Send to Quilt
pkg = ds.distribute(
    push_uri="s3://allencell-internal-quilt", message="Fish dataset by assay-dev"
)

# Distribute a test version as well
df = df.sample(n=1)

# Define package
ds = Dataset(
    dataset=df,
    name="assay_dev_fish_analysis_test",
    package_owner="matheus",
    readme_path="../../fish_morphology_code/processing/structure_organization/tools/assay-dev-fish.md",
)
import pandas as pd
from quilt3distribute import Dataset

df = pd.read_csv(
    "/allen/aics/microscopy/Calysta/test/fish_struc_seg/sarc_classification_for_Rory.csv"
)
df = df.drop(["Unnamed: 0"], axis=1)
df = df.drop(["Unnamed: 0.1"], axis=1)
df = df.rename(
    columns={
        "fov_path": "original_fov_location",
        "cell_num": "napariCell_ObjectNumber",
    }
)
df = df[["nuc_mask_path", "original_fov_location"]]
df = df.drop_duplicates()

# label-based (inclusive) slice: keeps the rows labeled 0 through 2
test_df = df.loc[0:2]

ds = Dataset(
    dataset=test_df,
    name="2d_nuclear_masks_test",
    package_owner="calystay",
    readme_path=r"C:\Users\calystay\Desktop\README.md",
)
ds.set_metadata_columns(["original_fov_location"])
ds.set_path_columns(["nuc_mask_path"])
ds.distribute(
    "s3://allencell-internal-quilt",
    message="2D nuclear masks with original_fov_location",
)
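# A hedged example of pulling a distributed package back down with quilt3,
# using the package name pushed just above:
import quilt3

p = quilt3.Package.browse(
    "calystay/2d_nuclear_masks_test", registry="s3://allencell-internal-quilt"
)
# walk the package's logical keys to confirm what was uploaded
for key, entry in p.walk():
    print(key)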