コード例 #1
0
def distribute_seg_dataset(
    test=False,
    csv_loc="../input_segs_and_tiffs/raw_seg_013_014_images.csv",
    col_name_map={
        "fov_path": "original_fov_location",
        "FOVId": "fov_id",
        "seg_file_name": "2D_fov_tiff_path",
    },
    dataset_name="2d_segmented_fields",
    package_owner="rorydm",
    s3_bucket="s3://allencell-internal-quilt",
):

    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols
    df = df.rename(col_name_map, axis="columns")

    # drop any cols with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for eg a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="README.md",
    )

    # structure scores as auxilary file
    score_files = [
        Path(f"../structure_scores/structure_score_55000000{p}.csv")
        for p in (13, 14)
    ]
    score_dfs = [
        pd.read_csv(f).rename({"mh Score": "mh score"}, axis="columns")
        for f in score_files
    ]
    df_score = pd.concat(score_dfs, axis="rows", ignore_index=True, sort=False)
    df_score.to_csv(Path("../structure_scores/structure_scores.csv"))

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["2D_fov_tiff_path"])
    ds.set_extra_files(
        ["../channel_defs.json", "../structure_scores/structure_scores.csv"])

    # tag with commit hash
    label = (subprocess.check_output(["git", "rev-parse",
                                      "HEAD"]).strip().decode("utf-8"))
    ds.distribute(s3_bucket,
                  message=f"git commit hash of fish_morphology_code = {label}")
コード例 #2
0
def distribute_autocontrasted_dataset(
    test=False,
    csv_loc="/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/image_manifest_final.csv",
    col_name_map={},
    dataset_name="2d_autocontrasted_fields_and_single_cells_actn2",
    package_owner="rorydm",
    s3_bucket="s3://allencell-internal-quilt",
):

    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols
    df = df.rename(col_name_map, axis="columns")
    df = df.drop(["2D_fov_tiff_path"], axis="columns").rename(col_name_map,
                                                              axis="columns")

    # drop any cols with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for eg a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path=
        "/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/README.md",
    )

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(
        ["rescaled_2D_fov_tiff_path", "rescaled_2D_single_cell_tiff_path"])
    ds.set_extra_files([
        "/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/channel_defs.json",
        "/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/parameters.json",
    ])

    # tag with commit hash
    label = (subprocess.check_output(["git", "rev-parse",
                                      "HEAD"]).strip().decode("utf-8"))
    ds.distribute(s3_bucket,
                  message=f"git commit hash of fish_morphology_code = {label}")
def distribute_nonstructure_dataset(
    test=False,
    csv_loc="nonstructure_fov_manifest_for_quilt.csv",
    col_name_map={
        "FOVId": "fov_id",
        "fov_path": "original_fov_location"
    },
    dataset_name="2d_nonstructure_fields",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):

    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols
    df = df.rename(col_name_map, axis="columns")

    # drop any cols with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for eg a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="README.md",
    )

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["merged_2D_fov_tiff_path"])
    ds.set_extra_files(["channel_defs.json"])

    # tag with commit hash
    label = (subprocess.check_output(["git", "rev-parse",
                                      "HEAD"]).strip().decode("utf-8"))
    ds.distribute(s3_bucket,
                  message=f"git commit hash of fish_morphology_code = {label}")
コード例 #4
0
def distribute_nuclear_masks(
    test=False,
    csv_loc=Path(
        "/allen/aics/microscopy/Calysta/test/fish_struc_seg/sarc_classification_for_Rory.csv"
    ),
    dataset_name="2d_nuclear_masks",
    package_owner="calystay",
    s3_bucket="s3://allencell-internal-quilt",
    readme_path="README.md",
):

    # read in original csv
    df_in = pd.read_csv(csv_loc)

    # extract original_fov_location and nuc_mask_path from dataframe
    df = df_in[["original_fov_location", "nuc_mask_path"]]
    df = df.drop_duplicates()

    # drop any cols with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for eg a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path=readme_path,
    )

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["nuclear_mask_path"])

    # tag with commit hash
    label = (subprocess.check_output(["git", "rev-parse",
                                      "HEAD"]).strip().decode("utf-8"))
    ds.distribute(s3_bucket,
                  message=f"git commit hash of fish_morphology_code = {label}")
コード例 #5
0
def test_validate(data, drop_on_error, expected_len):
    results = validate(data, drop_on_error=drop_on_error)

    if drop_on_error:
        assert len(results.data) == expected_len
コード例 #6
0
            "FeatureExplorerURL": fe_link
        })

# Create dataframe from rows
cell_id_to_fov_id_fe_link = pd.DataFrame(cell_id_to_fov_id_fe_link)

# Merge the dataframes into one
raw = raw.merge(cell_id_to_fov_id_fe_link,
                left_on="CellId",
                right_on="CellId",
                suffixes=("_raw", "_fe_link"))

# Step 3:
# Validate and prune the raw data
# We shouldn't lose any rows here but we are doing this as a safety measure
cleaned = validate(raw, drop_on_error=True)
print(f"Dropped {len(raw) - len(cleaned.data)} rows during validation.")

# Step 4:
# Send to dataset object for package construction
ds = Dataset(cleaned.data, "Pipeline Integrated Cell", "aics", "readme.md")

# Step 5:
# Add a license
ds.add_license("https://www.allencell.org/terms-of-use.html")

# Indicate column values to use for file metadata
ds.set_metadata_columns([
    "CellId", "CellIndex", "CellLine", "NucMembSegmentationAlgorithm",
    "NucMembSegmentationAlgorithmVersion", "FOVId", "Gene", "PlateId",
    "WellId", "ProteinDisplayName", "StructureDisplayName", "Workflow",