# Shared imports assumed for the examples below (from the fish_morphology_code repo)
import logging
import subprocess
import sys
import traceback
from pathlib import Path

import pandas as pd
from quilt3distribute import Dataset
from quilt3distribute.validation import validate

log = logging.getLogger(__name__)


def distribute_struct_scores_bonus(
    test=False,
    csv_loc="/allen/aics/gene-editing/FISH/2019/chaos/data/20200911_classifier_features_bonus/manifest_20201007_tg.csv",
    dataset_name="struct_scores_bonus",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):

    # read in original csv
    df = pd.read_csv(csv_loc)

    # subsample df, e.g. for a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="/allen/aics/gene-editing/FISH/2019/chaos/data/20200911_classifier_features_bonus/README.md",
    )

    # set data path cols, metadata cols, and extra files
    #     ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["result_image_path"])

    # tag with commit hash
    label = subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    ds.distribute(s3_bucket, message=f"git commit hash of fish_morphology_code = {label}")

def main():
    try:
        args = Args()

        # Create dataset
        ds = Dataset(dataset=args.dataset_path,
                     name=args.dataset_name,
                     package_owner=args.package_owner,
                     readme_path=args.readme_path)

        # Handle optional arguments if provided
        if args.usage_doc_or_link:
            ds.add_usage_doc(args.usage_doc_or_link)
        if args.license_doc_or_link:
            ds.add_license(args.license_doc_or_link)
        if args.metadata_columns:
            ds.set_metadata_columns(args.metadata_columns)
        if args.path_columns:
            ds.set_path_columns(args.path_columns)

        # Distribute
        pkg = ds.distribute(push_uri=args.push_uri, message=args.message)
        log.info(
            f"Completed distribution. "
            f"Package [name: '{args.package_owner}/{args.dataset_name}', version: {pkg.top_hash}]"
        )

    except Exception as e:
        log.error("=============================================")
        if args.debug:
            log.error("\n\n" + traceback.format_exc())
            log.error("=============================================")
        log.error("\n\n" + str(e) + "\n")
        log.error("=============================================")
        sys.exit(1)
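
main() relies on an Args object that is not shown here; a plausible argparse-based sketch (attribute names inferred from the usage above, not confirmed by the source):

import argparse

class Args(argparse.Namespace):
    # hypothetical reconstruction: flags mirror the attributes main() reads
    def __init__(self):
        super().__init__()
        p = argparse.ArgumentParser(description="Distribute a dataset with quilt3distribute")
        p.add_argument("--dataset_path", required=True)
        p.add_argument("--dataset_name", required=True)
        p.add_argument("--package_owner", required=True)
        p.add_argument("--readme_path", required=True)
        p.add_argument("--push_uri", required=True)
        p.add_argument("--message", default="")
        p.add_argument("--usage_doc_or_link", default=None)
        p.add_argument("--license_doc_or_link", default=None)
        p.add_argument("--metadata_columns", nargs="*", default=None)
        p.add_argument("--path_columns", nargs="*", default=None)
        p.add_argument("--debug", action="store_true")
        p.parse_args(namespace=self)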
Example #3
def extra_additions_dataset(example_frame, example_readme):
    ds = Dataset(example_frame, "test_dataset", "me", example_readme)
    ds.set_path_columns(["2dReadPath"])
    ds.set_extra_files([example_readme])
    ds.set_column_names_map({"2dReadPath": "MappedPath"})
    ds.set_metadata_columns(["Structure"])
    return ds
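
This builder reads like a pytest fixture; a hedged example of how it might be consumed (example_frame and example_readme are assumed to be fixtures providing a DataFrame and a README path):

def test_extra_additions_dataset(example_frame, example_readme):
    # hypothetical test: construction should succeed with the configured columns
    ds = extra_additions_dataset(example_frame, example_readme)
    assert ds is not None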
Example #4
def distribute_seg_dataset(
    test=False,
    csv_loc="../input_segs_and_tiffs/raw_seg_013_014_images.csv",
    col_name_map={
        "fov_path": "original_fov_location",
        "FOVId": "fov_id",
        "seg_file_name": "2D_fov_tiff_path",
    },
    dataset_name="2d_segmented_fields",
    package_owner="rorydm",
    s3_bucket="s3://allencell-internal-quilt",
):

    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols
    df = df.rename(col_name_map, axis="columns")

    # drop any cols with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df, e.g. for a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="README.md",
    )

    # structure scores as auxiliary files
    score_files = [
        Path(f"../structure_scores/structure_score_55000000{p}.csv")
        for p in (13, 14)
    ]
    score_dfs = [
        pd.read_csv(f).rename({"mh Score": "mh score"}, axis="columns")
        for f in score_files
    ]
    df_score = pd.concat(score_dfs, axis="rows", ignore_index=True, sort=False)
    df_score.to_csv(Path("../structure_scores/structure_scores.csv"))

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["2D_fov_tiff_path"])
    ds.set_extra_files(
        ["../channel_defs.json", "../structure_scores/structure_scores.csv"])

    # tag with commit hash
    label = subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    ds.distribute(s3_bucket, message=f"git commit hash of fish_morphology_code = {label}")
Example #5
def distribute_scrnaseq_data(
    test=False,
    csv_loc="scrnaseq_data_raw.csv",
    dataset_name="scrnaseq_data",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):

    df = pd.read_csv(csv_loc)

    # subsample features to make a test dataset
    if test:
        # write test matrix
        make_test_mtx(csv_loc=csv_loc)

        # make test manifest; counts only; no anndata
        df = pd.DataFrame({
            "counts": [
                "raw_counts_test.mtx",
                df["counts"][1],
                "cells_test.csv",
                "cells_test.csv",
            ]
        })

        dataset_name = f"{dataset_name}_test"

        # create the dataset without supplementary files
        ds = Dataset(
            dataset=df,
            name=dataset_name,
            package_owner=package_owner,
            readme_path="README.md",
        )

        # columns with files to upload
        ds.set_path_columns(["counts"])

    else:
        ds = Dataset(
            dataset=df,
            name=dataset_name,
            package_owner=package_owner,
            readme_path="README.md",
        )

        # columns with files to upload
        ds.set_path_columns(["counts", "anndata"])

        # RData object as a supplementary file
        ds.set_extra_files([
            "/allen/aics/gene-editing/RNA_seq/scRNAseq_SeeligCollaboration/2019_analysis/merged_experiment_1_2/scrnaseq_cardio_20191210.RData"
        ])

    # tag with commit hash
    label = subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    ds.distribute(s3_bucket, message=f"git commit hash of fish_morphology_code = {label}")
Example #6
def distribute_struct_scores_actn2_live(
    test=False,
    csv_loc="/allen/aics/gene-editing/FISH/2019/chaos/data/20201012_actn2_live_classifier_with_metadata/live_manifest.csv",
    dataset_name="struct_scores_actn2_live",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):

    # read in original csv
    df = pd.read_csv(csv_loc)
    df["CellPath_x"] = df["CellPath_x"].str.replace(
        "singlecells",
        "/allen/aics/assay-dev/computational/data/cardio_pipeline_datastep/local_staging_pipeline_actn2/singlecells/singlecells",
        regex=False,
    )
    df = df.drop(columns=[
        "BackgroundPath",
        "ClassificationPath",
        "MemMaxProjectionPath",
        "MemSegmentationPath",
        "NucMaxProjectionPath",
        "StrMaxIntensitySlicePath",
        "CellPath_y",
        "path",
        "image_name",
        "cell_id_filename",
    ])

    # subsample df, e.g. for a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="/allen/aics/gene-editing/FISH/2019/chaos/data/20200929_classifier_features_actn2/README_actn2_live.md",
    )

    # set data path cols, metadata cols, and extra files
    # ds.set_metadata_columns(["RawFilePath", "BackgroundPath", "ClassificationPath", "MemMaxProjectionPath", "MemSegmentationPath", "NucMaxProjectionPath", "StrMaxIntensitySlicePath"])
    ds.set_path_columns(["CellPath_x"])

    # tag with commit hash
    label = subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    ds.distribute(s3_bucket, message=f"git commit hash of fish_morphology_code = {label}")
Example #7
def distribute_autocontrasted_dataset(
    test=False,
    csv_loc="/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/image_manifest_final.csv",
    col_name_map={},
    dataset_name="2d_autocontrasted_fields_and_single_cells_actn2",
    package_owner="rorydm",
    s3_bucket="s3://allencell-internal-quilt",
):

    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols, then drop the unneeded raw tiff path column
    df = df.rename(col_name_map, axis="columns")
    df = df.drop(["2D_fov_tiff_path"], axis="columns")

    # drop any cols with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df, e.g. for a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/README.md",
    )

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(
        ["rescaled_2D_fov_tiff_path", "rescaled_2D_single_cell_tiff_path"])
    ds.set_extra_files([
        "/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/channel_defs.json",
        "/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/parameters.json",
    ])

    # tag with commit hash
    label = subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    ds.distribute(s3_bucket, message=f"git commit hash of fish_morphology_code = {label}")
Example #8
def distribute_struct_scores_actn2(
    test=False,
    csv_loc="/allen/aics/assay-dev/MicroscopyOtherData/Viana/projects/fish_morphology_code/fish_morphology_code/processing/structure_organization/results_Fish/AssayDevFishAnalsysis-Handoff-transcript2protein.csv",
    dataset_name="struct_scores_actn2_2",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):

    # read in original csv
    df = pd.read_csv(csv_loc)

    # only include new actn2 fish in this package -> 5500000322/323 imaged 2020-10
    date = df["original_fov_location"].str.split("/", expand=True)
    df["date"] = date[7]
    df = df[df.date.isin(["20201002", "20201006"])]
    df = df.drop(columns=["date"])

    # update result image dir (moved after processing)
    img_dir = "/allen/aics/assay-dev/MicroscopyOtherData/Viana/projects/fish_morphology_code/fish_morphology_code/processing/structure_organization/output_Fish/"
    new_result_path = [
        img_dir + Path(x).name for x in df["result_image_path"].tolist()
    ]
    df["result_image_path"] = new_result_path

    # subsample df, e.g. for a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="/allen/aics/gene-editing/FISH/2019/chaos/data/20200929_classifier_features_actn2/README.md",
    )

    # set data path cols, metadata cols, and extra files
    #     ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["result_image_path"])

    # tag with commit hash
    label = subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    ds.distribute(s3_bucket, message=f"git commit hash of fish_morphology_code = {label}")
Example #9
def distribute_nonstructure_dataset(
    test=False,
    csv_loc="nonstructure_fov_manifest_for_quilt.csv",
    col_name_map={
        "FOVId": "fov_id",
        "fov_path": "original_fov_location"
    },
    dataset_name="2d_nonstructure_fields",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):

    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols
    df = df.rename(col_name_map, axis="columns")

    # drop any cols with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df, e.g. for a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="README.md",
    )

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["merged_2D_fov_tiff_path"])
    ds.set_extra_files(["channel_defs.json"])

    # tag with commit hash
    label = subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    ds.distribute(s3_bucket, message=f"git commit hash of fish_morphology_code = {label}")
Example #10
def distribute_nuclear_masks(
    test=False,
    csv_loc=Path(
        "/allen/aics/microscopy/Calysta/test/fish_struc_seg/sarc_classification_for_Rory.csv"
    ),
    dataset_name="2d_nuclear_masks",
    package_owner="calystay",
    s3_bucket="s3://allencell-internal-quilt",
    readme_path="README.md",
):

    # read in original csv
    df_in = pd.read_csv(csv_loc)

    # extract original_fov_location and nuc_mask_path from dataframe
    df = df_in[["original_fov_location", "nuc_mask_path"]]
    df = df.drop_duplicates()

    # drop any cols with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df, e.g. for a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path=readme_path,
    )

    # set data path cols and metadata cols (the frame only carries these two columns)
    ds.set_metadata_columns(["original_fov_location"])
    ds.set_path_columns(["nuc_mask_path"])

    # tag with commit hash
    label = subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    ds.distribute(s3_bucket, message=f"git commit hash of fish_morphology_code = {label}")
Example #11
        "### Global structure organization and local structural alignment features\n\n"
    )
    for meta in metadata:
        for key, value in meta.items():
            ftxt.write("- `{0}`: {1}\n".format(
                value["name"] if value["name"] is not None else key,
                value["description"],
            ))

# Checking expected shape of the dataframe
assert df.shape == (5161, 25)

# Save a hand off version for the Modeling team
df.to_csv("../results/AssayDevFishAnalsysis-Handoff.csv")

# Upload to Quilt
ds = Dataset(
    dataset="../results/AssayDevFishAnalsysis-Handoff.csv",
    name="assay_dev_fish_analysis",
    package_owner="matheus",
    readme_path="assay-dev-fish.md",
)

# Set metadata and path columns
ds.set_metadata_columns(["CellId"])
ds.set_path_columns(["result_image_path"])

# Send to Quilt
pkg = ds.distribute(push_uri="s3://allencell-internal-quilt",
                    message="Fish dataset by assay-dev")
        "tanyasg/2d_autocontrasted_single_cell_features",
        "s3://allencell-internal-quilt",
    )
df_feat_inds = p_feats["features"]["a749d0e2_cp_features.csv"]()[["fov_path"]].rename(
    columns={"fov_path": "original_fov_location"}
)
df_feat_inds = df_feat_inds.drop_duplicates()

for index, row in df_feat_inds.iterrows():
    df_feat_inds.loc[index, 'original_fov_name'] = row['original_fov_location'].split('/')[-1]

for index, row in df.iterrows():
    # look up the full FOV location by the shared original_fov_name key
    df.loc[index, 'original_fov_location'] = df_feat_inds.loc[
        df_feat_inds['original_fov_name'] == row['original_fov_name'],
        'original_fov_location',
    ].values.tolist()[0]

# merge df
df_new = df.merge(df_feat_inds, how='inner', on=['original_fov_name'])
df_new = df_new.set_index('index')

# Upload to quilt
test_df = df_new[0:2]
ds = Dataset(
    dataset=df_new,
    name='3d_actn2_segmentation',
    package_owner='calystay',
    readme_path=r'C:\Users\calystay\Desktop\README.md',
)
ds.set_metadata_columns(["original_fov_location"])
ds.set_path_columns(["struc_seg_path"])
ds.distribute(
    "s3://allencell-internal-quilt",
    message="3D actn2 segmentation with original_fov_location",
)

Example #13
import pandas as pd
from quilt3distribute import Dataset

# Read dataset
df = pd.read_csv("../../fish_morphology_code/processing/structure_organization/results/AssayDevFishAnalsysis-Handoff.csv")

# Define package
ds = Dataset(
    dataset=df,
    name="assay_dev_fish_analysis",
    package_owner="matheus",
    readme_path="../../fish_morphology_code/processing/structure_organization/tools/assay-dev-fish.md",
)

# Metadata
ds.set_metadata_columns(["CellId"])
ds.set_path_columns(['result_image_path'])

# Send to Quilt
pkg = ds.distribute(push_uri="s3://allencell-internal-quilt", message="Fish dataset by assay-dev")

# Distribute a test version as well
df = df.sample(n=1)

# Define package
ds = Dataset(
    dataset=df,
    name="assay_dev_fish_analysis_test",
    package_owner="matheus",
    readme_path="../../fish_morphology_code/processing/structure_organization/tools/assay-dev-fish.md",
)
Example #14
import pandas as pd
from quilt3distribute import Dataset

df = pd.read_csv(
    '/allen/aics/microscopy/Calysta/test/fish_struc_seg/sarc_classification_for_Rory.csv'
)
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

df = df.rename(columns={
    'fov_path': 'original_fov_location',
    'cell_num': 'napariCell_ObjectNumber'
})

df = df[['nuc_mask_path', 'original_fov_location']]
df = df.drop_duplicates()

# take the first two rows for the test package
test_df = df.iloc[0:2]
ds = Dataset(
    dataset=test_df,
    name='2d_nuclear_masks_test',
    package_owner='calystay',
    readme_path=r'C:\Users\calystay\Desktop\README.md',
)
ds.set_metadata_columns(["original_fov_location"])
ds.set_path_columns(["nuc_mask_path"])
ds.distribute("s3://allencell-internal-quilt",
              message="2D nuclear masks with original_fov_location")