import logging
import sys
import traceback

from quilt3distribute import Dataset

log = logging.getLogger(__name__)


def main():
    # Args is this script's CLI wrapper (see the hypothetical sketch after this example).
    # Parse it before the try block so the except handler can safely read args.debug.
    args = Args()
    try:

        # Create dataset
        ds = Dataset(dataset=args.dataset_path,
                     name=args.dataset_name,
                     package_owner=args.package_owner,
                     readme_path=args.readme_path)

        # Handle the optional extras if provided
        if args.usage_doc_or_link:
            ds.add_usage_doc(args.usage_doc_or_link)
        if args.license_doc_or_link:
            ds.add_license(args.license_doc_or_link)
        if args.metadata_columns:
            ds.set_metadata_columns(args.metadata_columns)
        if args.path_columns:
            ds.set_path_columns(args.path_columns)

        # Distribute
        pkg = ds.distribute(push_uri=args.push_uri, message=args.message)
        log.info(
            f"Completed distribution. "
            f"Package [name: '{args.package_owner}/{args.dataset_name}', version: {pkg.top_hash}]"
        )

    except Exception as e:
        log.error("=============================================")
        if args.debug:
            log.error("\n\n" + traceback.format_exc())
            log.error("=============================================")
        log.error("\n\n" + str(e) + "\n")
        log.error("=============================================")
        sys.exit(1)
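`Args` is this script's own CLI wrapper and is not shown in the example. A minimal sketch of what it might look like, assuming argparse and inferring the attribute names from main() above (all flags here are hypothetical reconstructions):

import argparse

class Args(argparse.Namespace):
    # Hypothetical reconstruction of the script's CLI wrapper; only the
    # attribute names are taken from main() above, the flags are guesses.
    def __init__(self):
        p = argparse.ArgumentParser(description="Distribute a dataset as a Quilt package")
        p.add_argument("dataset_path")
        p.add_argument("dataset_name")
        p.add_argument("package_owner")
        p.add_argument("readme_path")
        p.add_argument("push_uri")
        p.add_argument("--message", default=None)
        p.add_argument("--usage-doc-or-link", dest="usage_doc_or_link", default=None)
        p.add_argument("--license-doc-or-link", dest="license_doc_or_link", default=None)
        p.add_argument("--metadata-columns", dest="metadata_columns", nargs="*", default=None)
        p.add_argument("--path-columns", dest="path_columns", nargs="*", default=None)
        p.add_argument("--debug", action="store_true")
        p.parse_args(namespace=self)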
Example #2
from quilt3distribute import Dataset


def test_dataset_auto_metadata_grouping_repeated_values(
        repeated_values_frame, example_readme):
    """
    Because the repeated values dataset has three unique files but has nine rows of data, this function
    checks that there are only three files passed to the package object but that each file has a list of the unique
    CellIds but that because all the structures are the same per file, that the structure has been reduced to a single
    value.
    """
    # Create dataset from frame
    ds = Dataset(repeated_values_frame, "test_dataset", "me", example_readme)
    ds.set_metadata_columns(["CellId", "Structure"])

    # Generate package
    pkg = ds.distribute()

    # Check file groupings available
    assert set(pkg.keys()) == {
        "SourceReadPath", "README.md", "metadata.csv", "referenced_files"
    }

    # Check that only three tiffs were attached to package
    assert len(pkg["SourceReadPath"]) == 3

    # Check that CellId is a list because of repeated values, and that
    # Structure is a single string because it is constant within each file
    for f in pkg["SourceReadPath"]:
        assert isinstance(pkg["SourceReadPath"][f].meta["CellId"], list)
        assert isinstance(pkg["SourceReadPath"][f].meta["Structure"], str)
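The `repeated_values_frame` fixture lives in the test suite's conftest and is not shown. A plausible minimal construction matching the docstring (three unique files, nine rows, nine unique CellIds, one Structure value per file) might be:

import pandas as pd

# Hypothetical stand-in for the repeated_values_frame fixture described above.
repeated_values_frame = pd.DataFrame({
    # three unique files, each repeated three times -> nine rows
    "SourceReadPath": [f"file_{i}.tiff" for i in range(3) for _ in range(3)],
    # nine unique CellIds -> grouped into a list per file
    "CellId": list(range(9)),
    # one Structure value per file -> collapsed to a single string per file
    "Structure": [s for s in ("actin", "dna", "membrane") for _ in range(3)],
})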
Example #3
import pytest
from quilt3distribute import Dataset


@pytest.fixture
def extra_additions_dataset(example_frame, example_readme):
    ds = Dataset(example_frame, "test_dataset", "me", example_readme)
    ds.set_path_columns(["2dReadPath"])
    ds.set_extra_files([example_readme])
    ds.set_column_names_map({"2dReadPath": "MappedPath"})
    ds.set_metadata_columns(["Structure"])
    return ds
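A usage sketch for this fixture: the 2dReadPath files should land under the mapped directory name once distributed (the test name here is illustrative):

def test_extra_additions(extra_additions_dataset):
    # distribute() without a push_uri builds the package locally
    pkg = extra_additions_dataset.distribute()
    # the "2dReadPath" files land under "MappedPath" because of set_column_names_map
    assert "MappedPath" in pkg.keys()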
Example #4
import subprocess
from pathlib import Path

import pandas as pd

from quilt3distribute import Dataset
from quilt3distribute.validation import validate


def distribute_seg_dataset(
    test=False,
    csv_loc="../input_segs_and_tiffs/raw_seg_013_014_images.csv",
    col_name_map={
        "fov_path": "original_fov_location",
        "FOVId": "fov_id",
        "seg_file_name": "2D_fov_tiff_path",
    },
    dataset_name="2d_segmented_fields",
    package_owner="rorydm",
    s3_bucket="s3://allencell-internal-quilt",
):

    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols
    df = df.rename(col_name_map, axis="columns")

    # drop any cols with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for, e.g., a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="README.md",
    )

    # structure scores as auxiliary files
    score_files = [
        Path(f"../structure_scores/structure_score_55000000{p}.csv")
        for p in (13, 14)
    ]
    score_dfs = [
        pd.read_csv(f).rename({"mh Score": "mh score"}, axis="columns")
        for f in score_files
    ]
    df_score = pd.concat(score_dfs, axis="index", ignore_index=True, sort=False)
    df_score.to_csv(Path("../structure_scores/structure_scores.csv"))

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["2D_fov_tiff_path"])
    ds.set_extra_files(
        ["../channel_defs.json", "../structure_scores/structure_scores.csv"])

    # tag with commit hash
    label = subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    ds.distribute(s3_bucket, message=f"git commit hash of fish_morphology_code = {label}")
Example #5
import numpy as np

from quilt3distribute import Dataset


def test_dataset_metadata_numpy_type_casting(example_frame, example_readme):
    # Add numpy column to frame
    example_frame["NumpyTypes"] = np.zeros(9)
    ds = Dataset(example_frame, "test_dataset", "me", example_readme)

    # Add column filled with numpy types to index
    ds.set_metadata_columns(["NumpyTypes"])

    # Just run distribute to make sure numpy types are cast correctly
    ds.distribute()
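The casting matters because Quilt metadata is stored as JSON, and numpy integer scalars, for example, are not JSON serializable on their own:

import json
import numpy as np

value = np.int64(1)
# json.dumps({"x": value}) raises TypeError: Object of type int64 is not JSON serializable
json.dumps({"x": int(value)})  # fine once cast to a native Python int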
Example #6
import subprocess

import pandas as pd

from quilt3distribute import Dataset


def distribute_cellprofiler_features(
    test=False,
    csv_loc="/allen/aics/gene-editing/FISH/2019/chaos/data/cp_20201022/merged_features/features2quilt/features2quilt.csv",
    dataset_name="2d_autocontrasted_single_cell_features_actn2_2",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):
    df = pd.read_csv(csv_loc)

    # subsample features to make a test dataset
    if test:
        # write test feature csv and test image counts csv
        make_test_csv(csv_loc=csv_loc)
        cell_line = df["cell_line"][0]
        cellprofiler_id = df["cellprofiler_id"][0]

        # make test manifest
        df = pd.DataFrame({
            "feature_file": ["cp_features_test.csv"],
            "image_object_count_file": ["image_object_counts_test.csv"],
            "cell_line": [cell_line],
            "cellprofiler_id": [cellprofiler_id],
        })

        dataset_name = f"{dataset_name}_test"

    # Create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="/allen/aics/gene-editing/FISH/2019/chaos/data/cp_20201022/merged_features/features2quilt/README.md",
    )

    # Optionally add common additional requirements
    ds.add_usage_doc(
        "https://docs.quiltdata.com/walkthrough/reading-from-a-package")
    ds.add_license("https://www.allencell.org/terms-of-use.html")

    # Optionally indicate column values to use for file metadata
    ds.set_metadata_columns(["cell_line", "cellprofiler_id"])

    # Optionally rename the columns on the package level
    ds.set_column_names_map({
        "feature_file": "features",
        "image_object_count_file": "object_counts"
    })

    # add commit hash to message
    label = subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    # Distribute
    ds.distribute(push_uri=s3_bucket, message=f"git commit hash of fish_morphology_code = {label}")
Example #7
import pytest

from quilt3distribute import Dataset


def test_dataset_metadata_non_json_serializable_type(example_frame, example_readme):
    # Add non json serializable type to dataframe
    example_frame["BadType"] = [SomeDummyObject(i) for i in range(9)]
    ds = Dataset(example_frame, "test_dataset", "me", example_readme)

    # Add column filled with non serializable type to index
    ds.set_metadata_columns(["BadType"])

    # Check non json serializable type check fails
    with pytest.raises(TypeError):
        ds.distribute()
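`SomeDummyObject` only needs to be something the json module refuses to serialize; a minimal stand-in:

class SomeDummyObject:
    # Any plain object without JSON support triggers the TypeError above
    def __init__(self, value):
        self.value = value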
Example #8
import subprocess

import pandas as pd

from quilt3distribute import Dataset
from quilt3distribute.validation import validate


def distribute_autocontrasted_dataset(
    test=False,
    csv_loc="/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/image_manifest_final.csv",
    col_name_map={},
    dataset_name="2d_autocontrasted_fields_and_single_cells_actn2",
    package_owner="rorydm",
    s3_bucket="s3://allencell-internal-quilt",
):

    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols and drop the tiff path column this package does not ship
    df = df.rename(col_name_map, axis="columns")
    df = df.drop(["2D_fov_tiff_path"], axis="columns")

    # drop any cols with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for, e.g., a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/README.md",
    )

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(
        ["rescaled_2D_fov_tiff_path", "rescaled_2D_single_cell_tiff_path"])
    ds.set_extra_files([
        "/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/channel_defs.json",
        "/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/parameters.json",
    ])

    # tag with commit hash
    label = subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    ds.distribute(s3_bucket, message=f"git commit hash of fish_morphology_code = {label}")
Example #9
import subprocess

import pandas as pd

from quilt3distribute import Dataset
from quilt3distribute.validation import validate


def distribute_nonstructure_dataset(
    test=False,
    csv_loc="nonstructure_fov_manifest_for_quilt.csv",
    col_name_map={
        "FOVId": "fov_id",
        "fov_path": "original_fov_location"
    },
    dataset_name="2d_nonstructure_fields",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):

    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols
    df = df.rename(col_name_map, axis="columns")

    # drop any cols with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for, e.g., a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="README.md",
    )

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["merged_2D_fov_tiff_path"])
    ds.set_extra_files(["channel_defs.json"])

    # tag with commit hash
    label = subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    ds.distribute(s3_bucket, message=f"git commit hash of fish_morphology_code = {label}")
Example #10
import subprocess
from pathlib import Path

import pandas as pd

from quilt3distribute import Dataset
from quilt3distribute.validation import validate


def distribute_nuclear_masks(
    test=False,
    csv_loc=Path(
        "/allen/aics/microscopy/Calysta/test/fish_struc_seg/sarc_classification_for_Rory.csv"
    ),
    dataset_name="2d_nuclear_masks",
    package_owner="calystay",
    s3_bucket="s3://allencell-internal-quilt",
    readme_path="README.md",
):

    # read in original csv
    df_in = pd.read_csv(csv_loc)

    # extract original_fov_location and nuc_mask_path from dataframe
    df = df_in[["original_fov_location", "nuc_mask_path"]]
    df = df.drop_duplicates()

    # drop any cols with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for, e.g., a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path=readme_path,
    )

    # set metadata cols and the data path col (only these two columns exist in df)
    ds.set_metadata_columns(["original_fov_location"])
    ds.set_path_columns(["nuc_mask_path"])

    # tag with commit hash
    label = subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    ds.distribute(s3_bucket, message=f"git commit hash of fish_morphology_code = {label}")
Example #11
# Excerpt: the earlier steps that load the raw manifest into `raw` are omitted.
from quilt3distribute import Dataset
from quilt3distribute.validation import validate

# We shouldn't lose any rows here, but validate as a safety measure
cleaned = validate(raw, drop_on_error=True)
print(f"Dropped {len(raw) - len(cleaned.data)} rows during validation.")

# Step 4:
# Send to dataset object for package construction
ds = Dataset(cleaned.data, "Pipeline Integrated Cell", "aics", "readme.md")

# Step 5:
# Add a license
ds.add_license("https://www.allencell.org/terms-of-use.html")

# Indicate column values to use for file metadata
ds.set_metadata_columns([
    "CellId", "CellIndex", "CellLine", "NucMembSegmentationAlgorithm",
    "NucMembSegmentationAlgorithmVersion", "FOVId", "Gene", "PlateId",
    "WellId", "ProteinDisplayName", "StructureDisplayName", "Workflow",
    "FeatureExplorerURL"
])

# Set produced package directory naming
ds.set_column_names_map({
    "MembraneContourReadPath": "membrane_contours",
    "MembraneSegmentationReadPath": "membrane_segmentations",
    "NucleusContourReadPath": "dna_contours",
    "NucleusSegmentationReadPath": "dna_segmentations",
    "SourceReadPath": "fovs",
})
Example #12
        "### Global structure organization and local structural alignment features\n\n"
    )
    for meta in metadata:
        for key, value in meta.items():
            ftxt.write("- `{0}`: {1}\n".format(
                value["name"] if value["name"] is not None else key,
                value["description"],
            ))

# Checking expected shape of the dataframe
assert df.shape == (5161, 25)

# Save a hand off version for the Modeling team
df.to_csv("../results/AssayDevFishAnalsysis-Handoff.csv")

# Upload to Quilt
ds = Dataset(
    dataset="../results/AssayDevFishAnalsysis-Handoff.csv",
    name="assay_dev_fish_analysis",
    package_owner="matheus",
    readme_path="assay-dev-fish.md",
)

# Set metadata and path columns
ds.set_metadata_columns(["CellId"])
ds.set_path_columns(["result_image_path"])

# Send to Quilt
pkg = ds.distribute(push_uri="s3://allencell-internal-quilt",
                    message="Fish dataset by assay-dev")
        "tanyasg/2d_autocontrasted_single_cell_features",
        "s3://allencell-internal-quilt",
    )
df_feat_inds = p_feats["features"]["a749d0e2_cp_features.csv"]()[["fov_path"]].rename(columns={"fov_path":"original_fov_location"})
df_feat_inds = df_feat_inds.drop_duplicates()

# Derive the bare FOV file name from each location path
df_feat_inds["original_fov_name"] = (
    df_feat_inds["original_fov_location"].str.split("/").str[-1]
)

# Fill original_fov_location in df by matching on the FOV name
for index, row in df.iterrows():
    df.loc[index, "original_fov_location"] = df_feat_inds.loc[
        df_feat_inds["original_fov_name"] == row["original_fov_name"],
        "original_fov_location",
    ].values[0]

# merge df
df_new = df.merge(df_feat_inds, how='inner', on=['original_fov_name'])
df_new = df_new.set_index('index')

# Upload to quilt
test_df = df_new[0:2]
ds = Dataset(
    dataset=df_new,
    name='3d_actn2_segmentation',
    package_owner='calystay',
    readme_path=r'C:\Users\calystay\Desktop\README.md',
)
ds.set_metadata_columns(["original_fov_location"])
ds.set_path_columns(["struc_seg_path"])
ds.distribute(
    "s3://allencell-internal-quilt",
    message="3D actn2 segmentation with original_fov_location",
)