Example #1
def test_dataset_auto_metadata_grouping_repeated_values(
        repeated_values_frame, example_readme):
    """
    Because the repeated values dataset has three unique files but has nine rows of data, this function
    checks that there are only three files passed to the package object but that each file has a list of the unique
    CellIds but that because all the structures are the same per file, that the structure has been reduced to a single
    value.
    """
    # Create dataset from frame
    ds = Dataset(repeated_values_frame, "test_dataset", "me", example_readme)
    ds.set_metadata_columns(["CellId", "Structure"])

    # Generate package
    pkg = ds.distribute()

    # Check file groupings available
    assert set(pkg.keys()) == {
        "SourceReadPath", "README.md", "metadata.csv", "referenced_files"
    }

    # Check that only three tiffs were attached to package
    assert len(pkg["SourceReadPath"]) == 3

    # Check that CellId is a list because of repeated values but that Structure is a
    # string because it is identical across all rows for a given file
    for f in pkg["SourceReadPath"]:
        assert isinstance(pkg["SourceReadPath"][f].meta["CellId"], list)
        assert isinstance(pkg["SourceReadPath"][f].meta["Structure"], str)
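The grouping behavior this test exercises can be sketched in plain Python. The values
below are illustrative only (not the real fixture data): rows that share a file collapse
into a single package entry, a metadata column with several distinct values is kept as a
list, and a column whose value repeats identically collapses to that single value.

rows = [
    {"SourceReadPath": "a.tiff", "CellId": 1, "Structure": "actn2"},
    {"SourceReadPath": "a.tiff", "CellId": 2, "Structure": "actn2"},
    {"SourceReadPath": "a.tiff", "CellId": 3, "Structure": "actn2"},
]

# Collapse the rows that share one file into a single metadata record
cell_ids = [r["CellId"] for r in rows]
structures = {r["Structure"] for r in rows}
meta = {
    "CellId": cell_ids,  # several distinct values -> kept as a list
    "Structure": structures.pop() if len(structures) == 1 else sorted(structures),
}
assert meta == {"CellId": [1, 2, 3], "Structure": "actn2"}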
Example #2
def extra_additions_dataset(example_frame, example_readme):
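    # Build a Dataset that exercises the optional setters: a path column, an extra
    # file, a package-level column rename, and a metadata column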
    ds = Dataset(example_frame, "test_dataset", "me", example_readme)
    ds.set_path_columns(["2dReadPath"])
    ds.set_extra_files([example_readme])
    ds.set_column_names_map({"2dReadPath": "MappedPath"})
    ds.set_metadata_columns(["Structure"])
    return ds
Example #3
def distribute_seg_dataset(
    test=False,
    csv_loc="../input_segs_and_tiffs/raw_seg_013_014_images.csv",
    col_name_map={
        "fov_path": "original_fov_location",
        "FOVId": "fov_id",
        "seg_file_name": "2D_fov_tiff_path",
    },
    dataset_name="2d_segmented_fields",
    package_owner="rorydm",
    s3_bucket="s3://allencell-internal-quilt",
):

    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols
    df = df.rename(col_name_map, axis="columns")

    # drop any cols with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df, e.g., for a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="README.md",
    )

    # structure scores as auxiliary file
    score_files = [
        Path(f"../structure_scores/structure_score_55000000{p}.csv")
        for p in (13, 14)
    ]
    score_dfs = [
        pd.read_csv(f).rename({"mh Score": "mh score"}, axis="columns")
        for f in score_files
    ]
    df_score = pd.concat(score_dfs, axis="rows", ignore_index=True, sort=False)
    df_score.to_csv(Path("../structure_scores/structure_scores.csv"))

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["2D_fov_tiff_path"])
    ds.set_extra_files(
        ["../channel_defs.json", "../structure_scores/structure_scores.csv"])

    # tag with commit hash
    label = (subprocess.check_output(["git", "rev-parse",
                                      "HEAD"]).strip().decode("utf-8"))
    ds.distribute(s3_bucket,
                  message=f"git commit hash of fish_morphology_code = {label}")
Example #4
def distribute_autocontrasted_dataset(
    test=False,
    csv_loc="/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/image_manifest_final.csv",
    col_name_map={},
    dataset_name="2d_autocontrasted_fields_and_single_cells_actn2",
    package_owner="rorydm",
    s3_bucket="s3://allencell-internal-quilt",
):

    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols and drop the original (un-rescaled) tiff path column
    df = df.rename(col_name_map, axis="columns")
    df = df.drop(["2D_fov_tiff_path"], axis="columns")

    # drop any cols with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df, e.g., for a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path=
        "/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/README.md",
    )

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(
        ["rescaled_2D_fov_tiff_path", "rescaled_2D_single_cell_tiff_path"])
    ds.set_extra_files([
        "/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/channel_defs.json",
        "/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/parameters.json",
    ])

    # tag with commit hash
    label = (subprocess.check_output(["git", "rev-parse",
                                      "HEAD"]).strip().decode("utf-8"))
    ds.distribute(s3_bucket,
                  message=f"git commit hash of fish_morphology_code = {label}")
Example #5
def test_dataset_file_grouping_with_matching_names(same_filenames_frame,
                                                   example_readme):
    # Create dataset from frame
    ds = Dataset(same_filenames_frame, "test_dataset", "me", example_readme)

    # Generate package
    pkg = ds.distribute()

    # Check file groupings available
    assert set(pkg.keys()) == {
        "SourceReadPath", "README.md", "metadata.csv", "referenced_files"
    }

    # Check that 18 unique files were attached to package
    assert len(pkg["SourceReadPath"]) == 18
Example #6
def distribute_nonstructure_dataset(
    test=False,
    csv_loc="nonstructure_fov_manifest_for_quilt.csv",
    col_name_map={
        "FOVId": "fov_id",
        "fov_path": "original_fov_location"
    },
    dataset_name="2d_nonstructure_fields",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):

    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols
    df = df.rename(col_name_map, axis="columns")

    # drop any cols with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df, e.g., for a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="README.md",
    )

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["merged_2D_fov_tiff_path"])
    ds.set_extra_files(["channel_defs.json"])

    # tag with commit hash
    label = (subprocess.check_output(["git", "rev-parse",
                                      "HEAD"]).strip().decode("utf-8"))
    ds.distribute(s3_bucket,
                  message=f"git commit hash of fish_morphology_code = {label}")
Example #7
def distribute_struct_scores_bonus(
    test=False,
    csv_loc="/allen/aics/gene-editing/FISH/2019/chaos/data/20200911_classifier_features_bonus/manifest_20201007_tg.csv",
    dataset_name="struct_scores_bonus",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):

    # read in original csv
    df = pd.read_csv(csv_loc)

    # subsample df, e.g., for a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path=
        "/allen/aics/gene-editing/FISH/2019/chaos/data/20200911_classifier_features_bonus/README.md",
    )

    # set data path cols, metadata cols, and extra files
    #     ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["result_image_path"])

    # tag with commit hash
    label = (subprocess.check_output(["git", "rev-parse",
                                      "HEAD"]).strip().decode("utf-8"))
    ds.distribute(s3_bucket,
                  message=f"git commit hash of fish_morphology_code = {label}")
Example #8
def test_dataset_metadata_numpy_type_casting(example_frame, example_readme):
    # Add numpy column to frame
    example_frame["NumpyTypes"] = np.zeros(9)
    ds = Dataset(example_frame, "test_dataset", "me", example_readme)

    # Add column filled with numpy types to index
    ds.set_metadata_columns(["NumpyTypes"])

    # Just run distribute to make sure numpy types are cast correctly
    ds.distribute()
Example #9
def distribute_nuclear_masks(
    test=False,
    csv_loc=Path(
        "/allen/aics/microscopy/Calysta/test/fish_struc_seg/sarc_classification_for_Rory.csv"
    ),
    dataset_name="2d_nuclear_masks",
    package_owner="calystay",
    s3_bucket="s3://allencell-internal-quilt",
    readme_path="README.md",
):

    # read in original csv
    df_in = pd.read_csv(csv_loc)

    # extract original_fov_location and nuc_mask_path from dataframe
    df = df_in[["original_fov_location", "nuc_mask_path"]]
    df = df.drop_duplicates()

    # drop any cols with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df, e.g., for a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path=readme_path,
    )

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["nuclear_mask_path"])

    # tag with commit hash
    label = (subprocess.check_output(["git", "rev-parse",
                                      "HEAD"]).strip().decode("utf-8"))
    ds.distribute(s3_bucket,
                  message=f"git commit hash of fish_morphology_code = {label}")
Example #10
def test_dataset_metadata_non_json_serializable_type(example_frame,
                                                     example_readme):
    # Add non json serializable type to dataframe
    example_frame["BadType"] = [SomeDummyObject(i) for i in range(9)]
    ds = Dataset(example_frame, "test_dataset", "me", example_readme)

    # Add column filled with non serializable type to index
    ds.set_metadata_columns(["BadType"])

    # Check non json serializable type check fails
    with pytest.raises(TypeError):
        ds.distribute()
Example #11
def distribute_struct_scores_actn2_live(
    test=False,
    csv_loc="/allen/aics/gene-editing/FISH/2019/chaos/data/20201012_actn2_live_classifier_with_metadata/live_manifest.csv",
    dataset_name="struct_scores_actn2_live",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):

    # read in original csv
    df = pd.read_csv(csv_loc)
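    # rewrite the relative "singlecells" prefix to the full path where the
    # single-cell images now live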
    df["CellPath_x"] = df["CellPath_x"].str.replace(
        "singlecells",
        "/allen/aics/assay-dev/computational/data/cardio_pipeline_datastep/local_staging_pipeline_actn2/singlecells/singlecells",
        regex=False,
    )
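    # drop intermediate path and id columns that are not distributed with this package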
    df = df.drop(columns=[
        "BackgroundPath",
        "ClassificationPath",
        "MemMaxProjectionPath",
        "MemSegmentationPath",
        "NucMaxProjectionPath",
        "StrMaxIntensitySlicePath",
        "CellPath_y",
        "path",
        "image_name",
        "cell_id_filename",
    ])

    # subsample df, e.g., for a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path=
        "/allen/aics/gene-editing/FISH/2019/chaos/data/20200929_classifier_features_actn2/README_actn2_live.md",
    )

    # set data path cols, metadata cols, and extra files
    # ds.set_metadata_columns(["RawFilePath", "BackgroundPath", "ClassificationPath", "MemMaxProjectionPath", "MemSegmentationPath", "NucMaxProjectionPath", "StrMaxIntensitySlicePath"])
    ds.set_path_columns(["CellPath_x"])

    # tag with commit hash
    label = (subprocess.check_output(["git", "rev-parse",
                                      "HEAD"]).strip().decode("utf-8"))
    ds.distribute(s3_bucket,
                  message=f"git commit hash of fish_morphology_code = {label}")
Example #12
def distribute_struct_scores_actn2(
    test=False,
    csv_loc="/allen/aics/assay-dev/MicroscopyOtherData/Viana/projects/fish_morphology_code/fish_morphology_code/processing/structure_organization/results_Fish/AssayDevFishAnalsysis-Handoff-transcript2protein.csv",
    dataset_name="struct_scores_actn2_2",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):

    # read in original csv
    df = pd.read_csv(csv_loc)

    # only include new actn2 fish in this package -> 5500000322/323 imaged 2020-10
    date = df["original_fov_location"].str.split("/", expand=True)
    df["date"] = date[7]
    df = df[df.date.isin(["20201002", "20201006"])]
    df = df.drop(columns=["date"])

    # update result image dir (moved after processing)
    img_dir = "/allen/aics/assay-dev/MicroscopyOtherData/Viana/projects/fish_morphology_code/fish_morphology_code/processing/structure_organization/output_Fish/"
    new_result_path = [
        img_dir + Path(x).name for x in df["result_image_path"].tolist()
    ]
    df["result_image_path"] = new_result_path

    # subsample df, e.g., for a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path=
        "/allen/aics/gene-editing/FISH/2019/chaos/data/20200929_classifier_features_actn2/README.md",
    )

    # set data path cols, metadata cols, and extra files
    #     ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["result_image_path"])

    # tag with commit hash
    label = (subprocess.check_output(["git", "rev-parse",
                                      "HEAD"]).strip().decode("utf-8"))
    ds.distribute(s3_bucket,
                  message=f"git commit hash of fish_morphology_code = {label}")
Example #13
def test_dataset_init_fail_csv_is_dir(data_dir, example_readme):
    with pytest.raises(IsADirectoryError):
        Dataset(data_dir, "test_dataset", "me", example_readme)
Example #14
def main():
    try:
        args = Args()

        # Create dataset
        ds = Dataset(dataset=args.dataset_path,
                     name=args.dataset_name,
                     package_owner=args.package_owner,
                     readme_path=args.readme_path)

        # Handle optional arguments if provided
        if args.usage_doc_or_link:
            ds.add_usage_doc(args.usage_doc_or_link)
        if args.license_doc_or_link:
            ds.add_license(args.license_doc_or_link)
        if args.metadata_columns:
            ds.set_metadata_columns(args.metadata_columns)
        if args.path_columns:
            ds.set_path_columns(args.path_columns)

        # Distribute
        pkg = ds.distribute(push_uri=args.push_uri, message=args.message)
        log.info(
            f"Completed distribution. "
            f"Package [name: '{args.package_owner}/{args.dataset_name}', version: {pkg.top_hash}]"
        )

    except Exception as e:
        log.error("=============================================")
        if args.debug:
            log.error("\n\n" + traceback.format_exc())
            log.error("=============================================")
        log.error("\n\n" + str(e) + "\n")
        log.error("=============================================")
        sys.exit(1)
Example #15
    # assumed: the start of this snippet is truncated; the call below appears to write
    # a section header to an open README file handle (ftxt)
    ftxt.write(
        "### Global structure organization and local structural alignment features\n\n"
    )
    for meta in metadata:
        for key, value in meta.items():
            ftxt.write("- `{0}`: {1}\n".format(
                value["name"] if value["name"] is not None else key,
                value["description"],
            ))

# Checking expected shape of the dataframe
assert df.shape == (5161, 25)

# Save a hand off version for the Modeling team
df.to_csv("../results/AssayDevFishAnalsysis-Handoff.csv")

# Upload to Quilt
ds = Dataset(
    dataset="../results/AssayDevFishAnalsysis-Handoff.csv",
    name="assay_dev_fish_analysis",
    package_owner="matheus",
    readme_path="assay-dev-fish.md",
)

# Set metadata and path columns
ds.set_metadata_columns(["CellId"])
ds.set_path_columns(["result_image_path"])

# Send to Quilt
pkg = ds.distribute(push_uri="s3://allencell-internal-quilt",
                    message="Fish dataset by assay-dev")
Example #16
def test_dataset_init_types(example_readme, dataset):
    Dataset(dataset, "test_dataset", "me", example_readme)
        "tanyasg/2d_autocontrasted_single_cell_features",
        "s3://allencell-internal-quilt",
    )
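# pull fov_path out of the features CSV in the browsed package and use it as
# original_fov_location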
df_feat_inds = p_feats["features"]["a749d0e2_cp_features.csv"]()[["fov_path"]].rename(columns={"fov_path":"original_fov_location"})
df_feat_inds = df_feat_inds.drop_duplicates()

for index, row in df_feat_inds.iterrows():
    df_feat_inds.loc[index, 'original_fov_name'] = row['original_fov_location'].split('/')[-1]

for index, row in df.iterrows():
    # match on the derived original_fov_name (df_feat_inds has no 'file_name' column)
    df.loc[index, 'original_fov_location'] = df_feat_inds.loc[
        df_feat_inds['original_fov_name'] == row['original_fov_name'],
        'original_fov_location'].values.tolist()[0]

# merge df
df_new = df.merge(df_feat_inds, how='inner', on=['original_fov_name'])
df_new = df_new.set_index('index')

# Upload to quilt
test_df = df_new[0:2]
ds = Dataset(
    dataset=df_new,
    name='3d_actn2_segmentation',
    package_owner='calystay',
    readme_path=r'C:\Users\calystay\Desktop\README.md',
)
ds.set_metadata_columns(["original_fov_location"])
ds.set_path_columns(["struc_seg_path"])
ds.distribute(
    "s3://allencell-internal-quilt", message="3D actn2 segmentation with original_fov_location"
    )

import pandas as pd
from quilt3distribute import Dataset

# Read dataset
df = pd.read_csv("../../fish_morphology_code/processing/structure_organization/results/AssayDevFishAnalsysis-Handoff.csv")

# Define package
ds = Dataset(
    dataset = df,
    name = "assay_dev_fish_analysis",
    package_owner = "matheus",
    readme_path = "../../fish_morphology_code/processing/structure_organization/tools/assay-dev-fish.md"
)

# Metadata
ds.set_metadata_columns(["CellId"])
ds.set_path_columns(['result_image_path'])

# Send to Quilt
pkg = ds.distribute(push_uri="s3://allencell-internal-quilt", message="Fish dataset by assay-dev")

# Distribute a test version as well
df = df.sample(n=1)

# Define package
ds = Dataset(
    dataset = df,
    name = "assay_dev_fish_analysis_test",
    package_owner = "matheus",
    readme_path = "../../fish_morphology_code/processing/structure_organization/tools/assay-dev-fish.md"
)

# assumed: a separate snippet starts here with its loop header truncated; the body uses
# `index` and `row`, so it presumably iterates over df with iterrows()
for index, row in df.iterrows():
    image_name = row['image_name']
    location = list(
        set(df_feat_inds.loc[df_feat_inds['image_name'] == image_name,
                             'original_fov_location']))[0]
    df.loc[index, 'original_fov_location'] = location

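# join the plotting table with df on FOV location and cell number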
plot_df = plot_ds.merge(
    right=df,
    left_on=['FOV path', 'Cell number'],
    right_on=['original_fov_location', 'napariCell_ObjectNumber'])

plot_df = plot_df[[
    'original_fov_location', 'napariCell_ObjectNumber',
    'seg_561_cell_dist_nuc_per_obj_median',
    'seg_638_cell_dist_nuc_per_obj_median'
]]

plot_df.to_csv('probe_localization_for_plot.csv')

test_df = df.loc[0:2]
ds = Dataset(
    dataset=df,
    name='probe_localization',
    package_owner='calystay',
    readme_path='C:/Users/calystay/Desktop/README.md',
)
ds.set_extra_files(['probe_localization_for_plot.csv'])
ds.set_metadata_columns(["original_fov_location"])
ds.distribute("s3://allencell-internal-quilt",
              message="probe localization with original_fov_location")
Example #20
import pandas as pd
from quilt3distribute import Dataset

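# read the sarcomere classification manifest and drop the exported index columns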
df = pd.read_csv(
    '/allen/aics/microscopy/Calysta/test/fish_struc_seg/sarc_classification_for_Rory.csv'
)
df = df.drop(['Unnamed: 0'], axis=1)
df = df.drop(['Unnamed: 0.1'], axis=1)

df = df.rename(columns={
    'fov_path': 'original_fov_location',
    'cell_num': 'napariCell_ObjectNumber'
})

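# keep only the nuclear mask path and FOV location, one row per unique pair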
df = df[['nuc_mask_path', 'original_fov_location']]
df = df.drop_duplicates()

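# build a small test package from the first few rows and push it to the internal bucket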
test_df = df.loc[0:2]
ds = Dataset(
    dataset=test_df,
    name='2d_nuclear_masks_test',
    package_owner='calystay',
    readme_path=r'C:\Users\calystay\Desktop\README.md',
)
ds.set_metadata_columns(["original_fov_location"])
ds.set_path_columns(["nuc_mask_path"])
ds.distribute("s3://allencell-internal-quilt",
              message="2D nuclear masks with original_fov_location")
Example #21
def test_dataset_init_csv(example_csv, example_readme):
    Dataset(example_csv, "test_dataset", "me", example_readme)
Example #22
def test_dataset_init_frame(example_frame, example_readme):
    Dataset(example_frame, "test_dataset", "me", example_readme)
Example #23
def test_dataset_init_fail_csv_does_not_exist(example_readme, dataset):
    Dataset(dataset, "test_dataset", "me", example_readme)
Example #24
def test_dataset_init_fail_readme_does_not_exist(example_frame, readme_path):
    Dataset(example_frame, "test_dataset", "me", readme_path)
Example #25
def test_dataset_init_fail_readme_is_dir(example_frame, data_dir):
    with pytest.raises(IsADirectoryError):
        Dataset(example_frame, "test_dataset", "me", data_dir)
Example #26
# Merge the dataframes into one
raw = raw.merge(cell_id_to_fov_id_fe_link,
                left_on="CellId",
                right_on="CellId",
                suffixes=("_raw", "_fe_link"))

# Step 3:
# Validate and prune the raw data
# We shouldn't lose any rows here but we are doing this as a safety measure
cleaned = validate(raw, drop_on_error=True)
print(f"Dropped {len(raw) - len(cleaned.data)} rows during validation.")

# Step 4:
# Send to dataset object for package construction
ds = Dataset(cleaned.data, "Pipeline Integrated Cell", "aics", "readme.md")

# Step 5:
# Add a license
ds.add_license("https://www.allencell.org/terms-of-use.html")

# Indicate column values to use for file metadata
ds.set_metadata_columns([
    "CellId", "CellIndex", "CellLine", "NucMembSegmentationAlgorithm",
    "NucMembSegmentationAlgorithmVersion", "FOVId", "Gene", "PlateId",
    "WellId", "ProteinDisplayName", "StructureDisplayName", "Workflow",
    "FeatureExplorerURL"
])

# Set produced package directory naming
ds.set_column_names_map({
    return "https://www.allencell.org/cell-feature-explorer.html?cellSelectedFor3D={}".format(
        row["CellId"])


raw["FeatureExplorerURL"] = raw.apply(create_feature_explorer_url, axis=1)

# Step 3:
# Validate and prune the raw data
# During the prune operation we lose ~16 rows of data to missing single cell feature files
# We are still investigating this...
cleaned = validate(raw, drop_on_error=True)
print(f"Dropped {len(raw) - len(cleaned.data)} rows during validation.")

# Step 4:
# Send to dataset object for package construction
ds = Dataset(cleaned.data, "Pipeline Integrated Single Cell", "aics",
             "readme.md")

# Step 5:
# Add a license
ds.add_license("https://www.allencell.org/terms-of-use.html")

# Indicate column values to use for file metadata
ds.set_metadata_columns([
    "CellId", "CellIndex", "CellLine", "NucMembSegmentationAlgorithm",
    "NucMembSegmentationAlgorithmVersion", "FOVId", "Gene", "PlateId",
    "WellId", "ProteinDisplayName", "StructureDisplayName", "Workflow",
    "FeatureExplorerURL"
])

# Set produced package directory naming
ds.set_column_names_map({
Example #28
def test_dataset_return_or_raise_approved_name(example_frame, example_readme,
                                               name):
    Dataset(example_frame, name, "me", example_readme)
Example #29
def distribute_cellprofiler_features(
    test=False,
    csv_loc="/allen/aics/gene-editing/FISH/2019/chaos/data/cp_20201022/merged_features/features2quilt/features2quilt.csv",
    dataset_name="2d_autocontrasted_single_cell_features_actn2_2",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):
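
    # read in original csv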
    df = pd.read_csv(csv_loc)

    # subsample features to make a test dataset
    if test:
        # write test feature csv and test image counts csv
        make_test_csv(csv_loc=csv_loc)
        cell_line = df["cell_line"][0]
        cellprofiler_id = df["cellprofiler_id"][0]

        # make test manifest
        df = pd.DataFrame({
            "feature_file": ["cp_features_test.csv"],
            "image_object_count_file": ["image_object_counts_test.csv"],
            "cell_line": [cell_line],
            "cellprofiler_id": [cellprofiler_id],
        })

        dataset_name = f"{dataset_name}_test"

    # Create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path=
        "/allen/aics/gene-editing/FISH/2019/chaos/data/cp_20201022/merged_features/features2quilt/README.md",
    )

    # Optionally add common additional requirements
    ds.add_usage_doc(
        "https://docs.quiltdata.com/walkthrough/reading-from-a-package")
    ds.add_license("https://www.allencell.org/terms-of-use.html")

    # Optionally indicate column values to use for file metadata
    ds.set_metadata_columns(["cell_line", "cellprofiler_id"])

    # Optionally rename the columns on the package level
    ds.set_column_names_map({
        "feature_file": "features",
        "image_object_count_file": "object_counts"
    })

    # add commit hash to message
    label = (subprocess.check_output(["git", "rev-parse",
                                      "HEAD"]).strip().decode("utf-8"))
    # Distribute
    ds.distribute(push_uri=s3_bucket,
                  message=f"git commit hash of fish_morphology_code = {label}")
Example #30
def distribute_scrnaseq_data(
    test=False,
    csv_loc="scrnaseq_data_raw.csv",
    dataset_name="scrnaseq_data",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):

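    # read in original csv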
    df = pd.read_csv(csv_loc)

    # subsample features to make a test dataset
    if test:
        # write test matrix
        make_test_mtx(csv_loc=csv_loc)

        # make test manifest; counts only; no anndata
        df = pd.DataFrame({
            "counts": [
                "raw_counts_test.mtx",
                df["counts"][1],
                "cells_test.csv",
                "cells_test.csv",
            ]
        })

        dataset_name = f"{dataset_name}_test"

        # create the dataset without supplementary files
        ds = Dataset(
            dataset=df,
            name=dataset_name,
            package_owner=package_owner,
            readme_path="README.md",
        )

        # columns with files to upload
        ds.set_path_columns(["counts"])

    else:
        ds = Dataset(
            dataset=df,
            name=dataset_name,
            package_owner=package_owner,
            readme_path="README.md",
        )

        # columns with files to upload
        ds.set_path_columns(["counts", "anndata"])

        # anndata object (h5ad) as supplementary files
        ds.set_extra_files([
            "/allen/aics/gene-editing/RNA_seq/scRNAseq_SeeligCollaboration/2019_analysis/merged_experiment_1_2/scrnaseq_cardio_20191210.RData"
        ])

    # tag with commit hash
    label = (subprocess.check_output(["git", "rev-parse",
                                      "HEAD"]).strip().decode("utf-8"))
    ds.distribute(s3_bucket,
                  message=f"git commit hash of fish_morphology_code = {label}")