Exemple #1
0
def main(args):
    """Median-normalize a gct file and write the normalized gct to disk.

    Args:
        args: parsed arguments; the attributes read here are in_gct_path,
            psp_config_path, divide_by_mad, ignore_subset_norm and out_name

    Returns:
        out_gct: the gct returned by median_normalize, with its
            col_metadata_df replaced by the output of insert_prov_code
    """
    # Load the input gct together with the config sections; the fourth
    # item returned by the reader is not needed here
    in_gct, config_io, config_metadata, _ = psp_utils.read_gct_and_config_file(
        args.in_gct_path, args.psp_config_path)

    # Pull the provenance code out of the column metadata
    prov_field = config_metadata["prov_code_field"]
    prov_delimiter = config_metadata["prov_code_delimiter"]
    original_prov_code = psp_utils.extract_prov_code(
        in_gct.col_metadata_df, prov_field, prov_delimiter)

    # Normalize; the provenance code comes back alongside the new gct
    normalized = median_normalize(
        in_gct,
        args.divide_by_mad,
        args.ignore_subset_norm,
        config_metadata,
        original_prov_code)
    out_gct, updated_prov_code = normalized

    # Decide where the output should be written
    out_path = configure_out_name(args.in_gct_path, args.out_name)

    # Put the returned provenance code back into the column metadata
    out_gct.col_metadata_df = insert_prov_code(
        out_gct.col_metadata_df,
        updated_prov_code,
        config_metadata["prov_code_delimiter"],
        config_metadata["prov_code_field"])

    # Persist the result, then hand the gct back to the caller
    data_null = config_io["data_null"]
    filler_null = config_io["filler_null"]
    write_output_gct(out_gct, out_path, data_null, filler_null)
    return out_gct
Exemple #2
0
    def test_extract_prov_code(self):
        # Arrange: three samples that all carry the same provenance string
        prov_strings = ["PRM+L2X"] * 3
        col_meta_df = pd.DataFrame.from_dict(
            {"foo": ["a", "b", "c"], "prov_field": prov_strings})

        # Act: extract using "+" as the delimiter
        actual = utils.extract_prov_code(col_meta_df, "prov_field", "+")

        # Assert: the string is returned split into its two entries
        expected = ["PRM", "L2X"]
        failure_msg = "prov_code is incorrect: {}".format(actual)
        self.assertEqual(expected, actual, failure_msg)
Exemple #3
0
def read_dry_gct_and_config_file(in_gct_path, config_path, forced_assay_type):
    """ Load a gct with its config and determine the assay type.

    The gct and the three config dictionaries come from
    psp_utils.read_gct_and_config_file. The provenance code is extracted
    from the column metadata with psp_utils.extract_prov_code. The assay
    type is forced_assay_type when that is not None, and the first entry of
    the provenance code otherwise; either way it is passed through
    check_assay_type, and the value that function returns is what the
    caller receives.

    Args:
        in_gct_path (string): filepath to gct file
        config_path (string): filepath to config file
        forced_assay_type (string, or None): overrides the assay type found
            in the provenance code when not None

    Returns:
        gct (GCToo object)
        assay_type (string): the output of check_assay_type
        prov_code (list of strings)
        config_io (dictionary)
        config_metadata (dictionary)
        config_parameters (dictionary)
    """
    # Load the data and the config sections in one call
    gct, config_io, config_metadata, config_parameters = (
        psp_utils.read_gct_and_config_file(in_gct_path, config_path))

    # Provenance code of the plate, taken from the column metadata
    prov_code = psp_utils.extract_prov_code(
        gct.col_metadata_df,
        config_metadata["prov_code_field"],
        config_metadata["prov_code_delimiter"])

    # A forced assay type wins; without one, fall back to the first entry
    # of the provenance code
    assay_type = prov_code[0] if forced_assay_type is None else forced_assay_type

    # The allowed assay types are stored in the config as Python expressions.
    # NOTE(review): eval() executes whatever expression the config contains;
    # if these entries are always plain literals, ast.literal_eval would be
    # the safer choice -- confirm against the config format before switching.
    p100_assay_types = eval(config_metadata["p100_assays"])
    gcp_assay_types = eval(config_metadata["gcp_assays"])

    # Validate the assay type against the allowed values
    checked_assay_type = check_assay_type(
        assay_type, p100_assay_types, gcp_assay_types)

    return (gct, checked_assay_type, prov_code,
            config_io, config_metadata, config_parameters)
Exemple #4
0
def main(args):
    """Compute per-sample summary metrics for a gct and write them to a pw file.

    Each row of the (un-logged) data is divided by that row's maximum value;
    the median and the mean absolute deviation of every column of the scaled
    data are then written, next to the plate and well names, to a
    tab-separated file.

    Args:
        args: parsed arguments; the attributes read here are gct_file_path,
            plate_field, well_field and out_pw_file_path

    Returns:
        None
    """
    # Import gct
    gct = parse(args.gct_file_path)

    # Get plate and well names
    (plate_names, well_names) = extract_plate_and_well_names(
        gct.col_metadata_df, args.plate_field, args.well_field)

    # Extract provenance code
    prov_code = utils.extract_prov_code(gct.col_metadata_df, PROV_CODE_FIELD,
                                        PROV_CODE_DELIMITER)

    # If data has been log-transformed, undo it
    unlogged_df = undo_log_transform_if_needed(gct.data_df, prov_code)

    # Divide by the maximum value for the row
    max_row_values = unlogged_df.max(axis="columns")
    divided_df = unlogged_df.div(max_row_values, axis="rows")

    # Calculate metrics for each sample (i.e. each column)
    medium_over_heavy_medians = divided_df.median(axis=0).values

    # Mean absolute deviation about the column mean. This is what
    # DataFrame.mad(axis=0) computed; that method was deprecated in pandas 1.5
    # and removed in pandas 2.0, so it is spelled out here to work on both old
    # and new pandas. NaNs are skipped, as they were by DataFrame.mad.
    # NOTE(review): assumes data_df holds only numeric columns -- confirm.
    column_means = divided_df.mean(axis=0)
    abs_deviations = divided_df.sub(column_means, axis="columns").abs()
    medium_over_heavy_mads = abs_deviations.mean(axis=0).values

    # Assemble plate_names, well_names, and metrics into a dataframe
    out_df = assemble_output_df(
        plate_names, well_names, {
            "medium_over_heavy_median": medium_over_heavy_medians,
            "medium_over_heavy_mad": medium_over_heavy_mads
        })

    # Write to pw file
    out_df.to_csv(args.out_pw_file_path, sep="\t", na_rep="NaN", index=False)
    logger.info("PW file written to {}".format(args.out_pw_file_path))