Example #1
def load_single_cell_compartment_csv(compartment_dir, compartment,
                                     metadata_cols):
    """
    Load and process columns for CellProfiler output data

    Arguments:
    compartment_dir - path to the directory where the compartment csv files are stored
    compartment - string representing the compartment to load (e.g. cytoplasm)
    metadata_cols - a list of columns to prefix with `Metadata_`.
        Note the entries should not already be prefixed by the compartment
        (e.g. AreaShape, not Cells_AreaShape)

    Output:
    A compartment dataframe with compartment prefixed column names
    """
    # Setup compartment file
    compartment = compartment.capitalize()
    compartment_file = pathlib.Path(compartment_dir, f"{compartment}.csv")

    # Load compartment data
    compartment_df = read_csvs_with_chunksize(compartment_file)
    compartment_df.columns = [
        f"{compartment}_{x}" for x in compartment_df.columns
    ]

    # Identify and rename metadata_cols
    metadata_rename = {}
    for col in metadata_cols:
        metadata_col = f"Metadata_{compartment}_{col}"
        metadata_rename[f"{compartment}_{col}"] = metadata_col

    compartment_df = compartment_df.rename(metadata_rename, axis="columns")

    return compartment_df
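
# Hypothetical usage sketch (not part of the original script); the directory and
# metadata column names below are illustrative assumptions. With compartment
# "cells", feature columns come back prefixed "Cells_" and the listed metadata
# columns come back prefixed "Metadata_Cells_".
cells_df = load_single_cell_compartment_csv(
    compartment_dir=pathlib.Path("example_dir/analysis/site_1"),
    compartment="cells",
    metadata_cols=["ObjectNumber", "Parent_Nuclei"],
)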
Example #2
def load_compartments(core, example_dir):
    compartments = core["compartments"]

    data = {}
    for compartment in compartments:

        compart_file = get_compartment_file(compartment, example_dir)
        df = read_csvs_with_chunksize(compart_file)
        df = recode_cols(df, core, compartment)

        data[compartment] = df

    return data
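
# Hypothetical usage sketch; the compartment list and directory are illustrative
# assumptions, and `core` stands in for the parsed core configuration.
core = {"compartments": ["cells", "nuclei", "cytoplasm"]}
compartment_dfs = load_compartments(core, example_dir="example_dir/analysis/site_1")
nuclei_df = compartment_dfs["nuclei"]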
print("Starting 2.process-cells.")
logging.info(f"Starting 2.process-cells.")
cell_quality = CellQuality(
    quality_func, category_class_name=quality_col, category_col_index=quality_idx
)
cell_category_dict = cell_quality.define_cell_quality()
empty_cell_category = len(cell_category_dict) + 1
cell_category_dict[empty_cell_category] = "Empty"
cell_category_df = pd.DataFrame(cell_category_dict, index=[quality_col]).transpose()

# Enables feature filtering by loading the Cell Painting feature file.
# 0.prefilter-features.py must be run first
try:
    all_feature_df = read_csvs_with_chunksize(prefilter_file, sep="\t").query(
        "not prefilter_column"
    )
except FileNotFoundError:
    raise FileNotFoundError(
        "Error",
        f"{prefilter_file} not found.  ",
        "Perform 0.prefilter-features.py prefilter before continuing...",
    )

# Load image metadata summary file to extract out important metadata indicators
# 1.process-spots.py must be run first
try:
    image_df = read_csvs_with_chunksize(input_image_file, sep="\t")
except FileNotFoundError:
    raise FileNotFoundError(
        "Error",
        f"{input_image_file} not found. ",
        "Perform 1.process-spots.py before continuing...",
    )

# Assign each measured site to its data split
# (assumes `sites` holds the measured site names, as in the later examples)
site_info_dict = get_split_aware_site_info(config["experiment"], sites,
                                           split_info,
                                           separator="___")

# Read and Merge Data
cell_quality_list = []
site_stat_list = []
pert_counts_list = []
for data_split_site in site_info_dict:
    split_sites = site_info_dict[data_split_site]

    for site in split_sites:
        # Aggregates cell quality by site into single list
        cell_count_file = pathlib.Path(
            f"{input_paintdir}/{site}/cell_counts_{site}.tsv")
        cell_quality_list.append(
            read_csvs_with_chunksize(cell_count_file, sep="\t"))

        # Aggregates site summary stats into a single list
        site_stat_file = pathlib.Path(input_spotdir, site, "site_stats.tsv")
        site_stat_list.append(
            read_csvs_with_chunksize(site_stat_file, sep="\t"))

        # Aggregates perturbation counts by site into a single list
        pert_count_file = pathlib.Path(
            input_spotdir, site,
            f"cell_perturbation_category_summary_counts.tsv")
        pert_counts_list.append(
            read_csvs_with_chunksize(pert_count_file, sep="\t"))

# Creates dataframe from cell quality list
cell_count_df = pd.concat(cell_quality_list,
Example #5
            ),
        )

        print(
            f"Now performing feature selection for {data_level}...with operations: {feature_select_operations} for split {data_split_site}"
        )
        logging.info(
            f"Performing feature selection for {data_level} with operations: {feature_select_operations} for split {data_split_site}"
        )

        output_file = feature_select_output_files[data_level]
        output_file = pathlib.Path(
            feature_select_output_files[data_level].parents[0],
            output_file.name.replace(".csv.gz", f"_{data_split_site}.csv.gz"),
        )
        df = read_csvs_with_chunksize(file_to_feature_select)

        feature_select(
            profiles=df,
            features=feature_select_features,
            samples=feature_select_drop_samples,
            operation=feature_select_operations,
            na_cutoff=feature_select_nacutoff,
            corr_threshold=feature_select_corr_threshold,
            output_file=output_file,
            compression_options=compression,
            float_format=float_format,
        )
print("Finished 3.feature-select.")
logging.info("Finished 3.feature-select.")
Example #6
                normalize_input_dir,
                file_to_normalize.name.replace(".csv.gz",
                                               f"_{data_split_site}.csv.gz"),
            )

        print(
            f"Now normalizing {data_level}...with operation: {normalize_method} for split {data_split_site}"
        )
        logging.info(
            f"Normalizing {data_level}...with operation: {normalize_method} for split {data_split_site}"
        )

        output_file = normalize_output_files[data_level]
        output_file = pathlib.Path(
            normalize_output_files[data_level].parents[0],
            output_file.name.replace(".csv.gz", f"_{data_split_site}.csv.gz"),
        )
        df = read_csvs_with_chunksize(file_to_normalize)

        normalize(
            profiles=df,
            features=normalize_these_features,
            samples=normalize_by_samples,
            method=normalize_method,
            output_file=output_file,
            compression_options=compression,
            float_format=float_format,
        )
print("Finished 2.normalize.")
logging.info("Finished 2.normalize.")
Example #7
    force = args.force

# Check if the combined single cell file already exists, and warn that rerunning has no effect without --force
if single_file_only:
    if single_file_only_output_file.exists():
        if not force:
            warnings.warn(
                "Combined single cell file exists. Use '--force' to overwrite."
            )
            logging.warning("Combined single cell file already exists.")

print("Starting 0.merge-single-cells.")
logging.info(f"Started 0.merge-single-cells.")

# Load preselected features
all_feature_df = read_csvs_with_chunksize(prefilter_file, sep="\t")

if prefilter_features:
    all_feature_df = all_feature_df.query("not prefilter_column")

# Pull out all sites that were measured
sites = [x.name for x in input_spotdir.iterdir() if x.name not in ignore_files]
site_info_dict = get_split_aware_site_info(
    config["experiment"], sites, split_info, separator="___"
)
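
# Illustrative assumption of the structure returned above (values are made up):
# site_info_dict maps each data split to the sites assigned to it, e.g.
#   {"train": ["plate1-A01-1", "plate1-A01-2"], "test": ["plate1-B02-1"]}
# so the loops below iterate over splits and then over the sites within each split.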

allowed_skip_counter = 0
for data_split_site in site_info_dict:
    split_sites = site_info_dict[data_split_site]

    sc_df = []
Example #8
for data_split_site in site_info_dict:
    # Define a dataset specific file
    single_cell_dataset_file = pathlib.Path(
        single_cell_output_dir,
        single_cell_file.name.replace(".csv.gz", f"_{data_split_site}.csv.gz"),
    )
    # Input argument flow control
    if aggregate_from_single_file:
        assert (
            single_cell_dataset_file.exists()
        ), "Error! The single cell file does not exist! Check 0.merge-single-cells.py"

    # Load single cell data
    if aggregate_from_single_file:
        print(f"Loading one single cell file: {single_cell_dataset_file}")
        single_cell_df = read_csvs_with_chunksize(single_cell_dataset_file,
                                                  sep=",")
        logging.info(
            f"Loaded one single cell file: {single_cell_dataset_file}")
    else:
        sites = site_info_dict[data_split_site]
        print(f"Now loading data from {len(sites)} sites")
        logging.info(f"Loading data from {len(sites)} sites")
        single_cell_df = []
        for site in sites:
            site_file = single_cell_site_files[site]
            if site_file.exists():
                site_df = read_csvs_with_chunksize(site_file, sep=",")
                single_cell_df.append(site_df)
                print(f"Appended {site}")
                logging.info(f"Appended {site}")
            else:
force = plate_summary_config["force_overwrite"]
perform = plate_summary_config["perform"]

# check if this step should be performed
if not perform:
    sys.exit("Config file set to perform=False, not performing {}".format(__file__))

# Forced overwrite can be achieved in one of two ways.
# The command line overrides the config file, check here if it is provided
if not force:
    force = args.force

print("Starting 4.image-and-segmentation-qc.")
logging.info(f"Started 4.image-and-segmentation-qc.")

cell_count_df = read_csvs_with_chunksize(cell_count_file, sep="\t")

# Creates x, y coordinates for plotting per-plate views.
# Assumes image numbering starts in upper left corner and proceeds down
final_order = []
for i in range(1, sites_per_image_grid_side + 1):
    build_seq = list(
        zip(
            ([i] * (sites_per_image_grid_side + 1)),
            reversed(range(1, (sites_per_image_grid_side + 1))),
        )
    )
    final_order += build_seq

# Uses sites_list in case there are fewer analyzed sites than acquired sites
sites_list = [*range(1, (sites_per_image_grid_side * sites_per_image_grid_side) + 1)]
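
# Worked illustration (assuming sites_per_image_grid_side = 3): the ordering
# starts at the top of column 1 and proceeds down, then moves to column 2, so
#   final_order == [(1, 3), (1, 2), (1, 1),
#                   (2, 3), (2, 2), (2, 1),
#                   (3, 3), (3, 2), (3, 1)]
#   sites_list  == [1, 2, 3, 4, 5, 6, 7, 8, 9]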
Example #10
allowed_skip_counter = 0
for data_split_site in site_info_dict:
    split_sites = site_info_dict[data_split_site]
    for site in split_sites:
        if allowed_skips >= allowed_skip_counter:
            print(
                f"Now processing spots for {site}...part of set {data_split_site}"
            )
            logging.info(
                f"Now processing spots for {site}...part of set {data_split_site}"
            )

            # Load image metadata per site
            try:
                image_file = pathlib.Path(input_batchdir, site, "Image.csv")
                image_df = read_csvs_with_chunksize(image_file).assign(
                    Metadata_site=site, Metadata_dataset_split=data_split_site)
                image_list.append(image_df)

                # Obtain specific metadata info
                well = image_df.loc[:, image_cols["well"]].squeeze()
                plate = image_df.loc[:, image_cols["plate"]].squeeze()
                site_location = image_df.loc[:, image_cols["site"]].squeeze()
            except FileNotFoundError:
                print(f"{site} image metadata does not exist. Skipping...")
                logging.info(f"Skipped {site}. No Image.csv")
                continue
            except Exception:
                print(f"Couldn't parse {site} image metadata. Skipping...")
                logging.warning(
                    f"Couldn't parse {site} image metadata. Skipping...")
                allowed_skip_counter += 1