def main():
    """Script entry point: build a Dataset from CLI args and distribute it.

    Exits with status 1 on any failure; logs the full traceback when the
    debug flag was requested.
    """
    # Pre-bind `args` so the except handler can safely test `args.debug`
    # even when Args() itself is what raised. Previously a failure in
    # Args() caused a NameError inside the handler, masking the real error.
    args = None
    try:
        args = Args()

        # Create dataset
        ds = Dataset(
            dataset=args.dataset_path,
            name=args.dataset_name,
            package_owner=args.package_owner,
            readme_path=args.readme_path,
        )

        # Handle optional provided
        if args.usage_doc_or_link:
            ds.add_usage_doc(args.usage_doc_or_link)
        if args.license_doc_or_link:
            ds.add_license(args.license_doc_or_link)
        if args.metadata_columns:
            ds.set_metadata_columns(args.metadata_columns)
        if args.path_columns:
            ds.set_path_columns(args.path_columns)

        # Distribute
        pkg = ds.distribute(push_uri=args.push_uri, message=args.message)

        log.info(
            f"Completed distribution. "
            f"Package [name: '{args.package_owner}/{args.dataset_name}', version: {pkg.top_hash}]"
        )

    except Exception as e:
        log.error("=============================================")
        if args is not None and args.debug:
            log.error("\n\n" + traceback.format_exc())
            log.error("=============================================")
        log.error("\n\n" + str(e) + "\n")
        log.error("=============================================")
        sys.exit(1)
def test_dataset_auto_metadata_grouping_repeated_values(
    repeated_values_frame, example_readme
):
    """
    The repeated values dataset has three unique files spread over nine rows.
    Verify that only three files end up in the package, that each file's
    CellId metadata is the list of unique CellIds, and that Structure —
    identical for every row of a file — is collapsed to a single string.
    """
    dataset = Dataset(repeated_values_frame, "test_dataset", "me", example_readme)
    dataset.set_metadata_columns(["CellId", "Structure"])

    # Generate package
    pkg = dataset.distribute()

    # Expected top-level groupings
    expected_keys = {"SourceReadPath", "README.md", "metadata.csv", "referenced_files"}
    assert expected_keys == set(pkg.keys())

    # Only the three unique tiffs should have been attached
    assert len(pkg["SourceReadPath"]) == 3

    # CellId collapses to a list (repeated values); Structure to a single str
    for fname in pkg["SourceReadPath"]:
        meta = pkg["SourceReadPath"][fname].meta
        assert isinstance(meta["CellId"], list)
        assert isinstance(meta["Structure"], str)
def extra_additions_dataset(example_frame, example_readme):
    """A Dataset preconfigured with every optional addition exercised by tests."""
    dataset = Dataset(example_frame, "test_dataset", "me", example_readme)

    # Apply each optional configuration hook once
    dataset.set_path_columns(["2dReadPath"])
    dataset.set_extra_files([example_readme])
    dataset.set_column_names_map({"2dReadPath": "MappedPath"})
    dataset.set_metadata_columns(["Structure"])

    return dataset
def distribute_seg_dataset(
    test=False,
    csv_loc="../input_segs_and_tiffs/raw_seg_013_014_images.csv",
    col_name_map=None,
    dataset_name="2d_segmented_fields",
    package_owner="rorydm",
    s3_bucket="s3://allencell-internal-quilt",
):
    """Package the 2D segmented fields manifest and push it to Quilt.

    Parameters
    ----------
    test: bool
        If True, subsample to 2 rows and suffix the package name with "_test".
    csv_loc: str
        Path to the source manifest csv.
    col_name_map: dict or None
        Column renames applied to the manifest; defaults to the standard
        fov/seg mapping when None.
    dataset_name, package_owner, s3_bucket: str
        Quilt package coordinates and push target.
    """
    # Avoid a mutable default argument; None stands in for the standard map.
    if col_name_map is None:
        col_name_map = {
            "fov_path": "original_fov_location",
            "FOVId": "fov_id",
            "seg_file_name": "2D_fov_tiff_path",
        }

    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols
    df = df.rename(col_name_map, axis="columns")

    # drop any rows with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for e.g. a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="README.md",
    )

    # structure scores as auxiliary file: merge the two per-plate score csvs
    score_files = [
        Path(f"../structure_scores/structure_score_55000000{p}.csv") for p in (13, 14)
    ]
    score_dfs = [
        pd.read_csv(f).rename({"mh Score": "mh score"}, axis="columns")
        for f in score_files
    ]
    df_score = pd.concat(score_dfs, axis="rows", ignore_index=True, sort=False)
    df_score.to_csv(Path("../structure_scores/structure_scores.csv"))

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["2D_fov_tiff_path"])
    ds.set_extra_files(
        ["../channel_defs.json", "../structure_scores/structure_scores.csv"]
    )

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
def test_dataset_metadata_numpy_type_casting(example_frame, example_readme):
    """distribute() should succeed when a metadata column holds numpy values."""
    # Attach a column of numpy floats to the frame
    example_frame["NumpyTypes"] = np.zeros(9)

    dataset = Dataset(example_frame, "test_dataset", "me", example_readme)
    dataset.set_metadata_columns(["NumpyTypes"])

    # Should not raise: numpy types must be cast cleanly during distribution
    dataset.distribute()
def distribute_cellprofiler_features(
    test=False,
    csv_loc="/allen/aics/gene-editing/FISH/2019/chaos/data/cp_20201022/merged_features/features2quilt/features2quilt.csv",
    dataset_name="2d_autocontrasted_single_cell_features_actn2_2",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):
    """Package the CellProfiler single-cell feature manifest and push to Quilt."""
    df = pd.read_csv(csv_loc)

    if test:
        # Write the trimmed feature csv and image-counts csv, then build a
        # one-row manifest pointing at them.
        make_test_csv(csv_loc=csv_loc)
        line = df["cell_line"][0]
        cp_id = df["cellprofiler_id"][0]
        df = pd.DataFrame(
            {
                "feature_file": ["cp_features_test.csv"],
                "image_object_count_file": ["image_object_counts_test.csv"],
                "cell_line": [line],
                "cellprofiler_id": [cp_id],
            }
        )
        dataset_name = f"{dataset_name}_test"

    # Create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="/allen/aics/gene-editing/FISH/2019/chaos/data/cp_20201022/merged_features/features2quilt/README.md",
    )

    # Common additional docs
    ds.add_usage_doc("https://docs.quiltdata.com/walkthrough/reading-from-a-package")
    ds.add_license("https://www.allencell.org/terms-of-use.html")

    # Column values to use for file metadata
    ds.set_metadata_columns(["cell_line", "cellprofiler_id"])

    # Package-level directory names for the path columns
    ds.set_column_names_map(
        {"feature_file": "features", "image_object_count_file": "object_counts"}
    )

    # Tag the push with the current commit hash
    commit = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        push_uri=s3_bucket,
        message=f"git commit hash of fish_morphology_code = {commit}",
    )
def test_dataset_metadata_non_json_serializable_type(example_frame, example_readme):
    """distribute() must raise TypeError when metadata is not JSON serializable."""
    # Fill a column with objects json cannot encode
    example_frame["BadType"] = [SomeDummyObject(i) for i in range(9)]

    dataset = Dataset(example_frame, "test_dataset", "me", example_readme)
    dataset.set_metadata_columns(["BadType"])

    # The serializability check should reject the column
    with pytest.raises(TypeError):
        dataset.distribute()
def distribute_autocontrasted_dataset(
    test=False,
    csv_loc="/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/image_manifest_final.csv",
    col_name_map=None,
    dataset_name="2d_autocontrasted_fields_and_single_cells_actn2",
    package_owner="rorydm",
    s3_bucket="s3://allencell-internal-quilt",
):
    """Package the autocontrasted fields/single-cells manifest and push to Quilt.

    Parameters
    ----------
    test: bool
        If True, subsample to 2 rows and suffix the package name with "_test".
    csv_loc: str
        Path to the source manifest csv.
    col_name_map: dict or None
        Column renames applied to the manifest; defaults to no renames.
    dataset_name, package_owner, s3_bucket: str
        Quilt package coordinates and push target.
    """
    # Avoid a mutable default argument; preserve the empty-mapping default.
    if col_name_map is None:
        col_name_map = {}

    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols, then drop the raw tiff path column that is not shipped
    # (the original applied the same rename map a second time after the drop,
    # which is redundant — pandas renames do not chain)
    df = df.rename(col_name_map, axis="columns")
    df = df.drop(["2D_fov_tiff_path"], axis="columns")

    # drop any rows with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for e.g. a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/README.md",
    )

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(
        ["rescaled_2D_fov_tiff_path", "rescaled_2D_single_cell_tiff_path"]
    )
    ds.set_extra_files(
        [
            "/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/channel_defs.json",
            "/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/parameters.json",
        ]
    )

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
def distribute_nonstructure_dataset(
    test=False,
    csv_loc="nonstructure_fov_manifest_for_quilt.csv",
    col_name_map=None,
    dataset_name="2d_nonstructure_fields",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):
    """Package the 2D nonstructure fields manifest and push it to Quilt.

    Parameters
    ----------
    test: bool
        If True, subsample to 2 rows and suffix the package name with "_test".
    csv_loc: str
        Path to the source manifest csv.
    col_name_map: dict or None
        Column renames applied to the manifest; defaults to the standard
        fov mapping when None.
    dataset_name, package_owner, s3_bucket: str
        Quilt package coordinates and push target.
    """
    # Avoid a mutable default argument; None stands in for the standard map.
    if col_name_map is None:
        col_name_map = {
            "FOVId": "fov_id",
            "fov_path": "original_fov_location",
        }

    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols
    df = df.rename(col_name_map, axis="columns")

    # drop any rows with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for e.g. a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="README.md",
    )

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["merged_2D_fov_tiff_path"])
    ds.set_extra_files(["channel_defs.json"])

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
def distribute_nuclear_masks(
    test=False,
    csv_loc=Path(
        "/allen/aics/microscopy/Calysta/test/fish_struc_seg/sarc_classification_for_Rory.csv"
    ),
    dataset_name="2d_nuclear_masks",
    package_owner="calystay",
    s3_bucket="s3://allencell-internal-quilt",
    readme_path="README.md",
):
    """Package the per-FOV nuclear mask manifest and push it to Quilt.

    Parameters
    ----------
    test: bool
        If True, subsample to 2 rows and suffix the package name with "_test".
    csv_loc: Path
        Path to the sarcomere classification csv containing mask paths.
    dataset_name, package_owner, s3_bucket, readme_path: str
        Quilt package coordinates, push target, and readme location.
    """
    # read in original csv
    df_in = pd.read_csv(csv_loc)

    # extract original_fov_location and nuc_mask_path, one row per FOV
    df = df_in[["original_fov_location", "nuc_mask_path"]]
    df = df.drop_duplicates()

    # drop any rows with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for e.g. a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path=readme_path,
    )

    # set metadata and data path cols.
    # NOTE(review): the frame only contains the two columns selected above, so
    # the originally requested "fov_id" metadata column and the misspelled
    # "nuclear_mask_path" path column did not exist in df; use the columns
    # that are actually present.
    ds.set_metadata_columns(["original_fov_location"])
    ds.set_path_columns(["nuc_mask_path"])

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
# We shouldn't lose any rows here but we are doing this as a safety measure cleaned = validate(raw, drop_on_error=True) print(f"Dropped {len(raw) - len(cleaned.data)} rows during validation.") # Step 4: # Send to dataset object for package construction ds = Dataset(cleaned.data, "Pipeline Integrated Cell", "aics", "readme.md") # Step 5: # Add a license ds.add_license("https://www.allencell.org/terms-of-use.html") # Indicate column values to use for file metadata ds.set_metadata_columns([ "CellId", "CellIndex", "CellLine", "NucMembSegmentationAlgorithm", "NucMembSegmentationAlgorithmVersion", "FOVId", "Gene", "PlateId", "WellId", "ProteinDisplayName", "StructureDisplayName", "Workflow", "FeatureExplorerURL" ]) # Set produced package directory naming ds.set_column_names_map({ "MembraneContourReadPath": "membrane_contours", "MembraneSegmentationReadPath": "membrane_segmentations", "NucleusContourReadPath": "dna_contours", "NucleusSegmentationReadPath": "dna_segmentations", "SourceReadPath": "fovs",
"### Global structure organization and local structural alignment features\n\n" ) for meta in metadata: for key, value in meta.items(): ftxt.write("- `{0}`: {1}\n".format( value["name"] if value["name"] is not None else key, value["description"], )) # Checking expected shape of the dataframe assert df.shape == (5161, 25) # Save a hand off version for the Modeling team df.to_csv("../results/AssayDevFishAnalsysis-Handoff.csv") # Upload to Quilt ds = Dataset( dataset="../results/AssayDevFishAnalsysis-Handoff.csv", name="assay_dev_fish_analysis", package_owner="matheus", readme_path="assay-dev-fish.md", ) # Set metadata and path columns ds.set_metadata_columns(["CellId"]) ds.set_path_columns(["result_image_path"]) # Send to Quilt pkg = ds.distribute(push_uri="s3://allencell-internal-quilt", message="Fish dataset by assay-dev")
"tanyasg/2d_autocontrasted_single_cell_features", "s3://allencell-internal-quilt", ) df_feat_inds = p_feats["features"]["a749d0e2_cp_features.csv"]()[["fov_path"]].rename(columns={"fov_path":"original_fov_location"}) df_feat_inds = df_feat_inds.drop_duplicates() for index, row in df_feat_inds.iterrows(): df_feat_inds.loc[index, 'original_fov_name'] = row['original_fov_location'].split('/')[-1] for index, row in df.iterrows(): df.loc[index, 'original_fov_location'] = df_feat_inds.loc[df_feat_inds['file_name'] == row['original_fov_name'], 'original_fov_location'].values.tolist()[0] # merge df df_new = df.merge(df_feat_inds, how='inner', on=['original_fov_name']) df_new = df_new.set_index('index') # Upload to quilt test_df = df_new[0:2] ds = Dataset( dataset=df_new, name='3d_actn2_segmentation', package_owner='calystay', readme_path=r'C:\Users\calystay\Desktop\README.md', ) ds.set_metadata_columns(["original_fov_location"]) ds.set_path_columns(["struc_seg_path"]) ds.distribute( "s3://allencell-internal-quilt", message="3D actn2 segmentation with original_fov_location" )