def main(args):
    """Run median normalization on a GCT file and write the result.

    Reads the input GCT and the PSP config file, extracts the provenance
    code from the column metadata, median-normalizes the data, puts the
    (possibly updated) provenance code back into the column metadata, and
    writes the normalized GCT to disk.

    Args:
        args: parsed command-line arguments; this function reads
            in_gct_path, psp_config_path, divide_by_mad,
            ignore_subset_norm and out_name from it

    Returns:
        out_gct: the normalized GCT object returned by median_normalize,
            with its col_metadata_df replaced by the output of
            insert_prov_code
    """
    # Load the data and configuration; the fourth returned item is not
    # needed here
    in_gct, config_io, config_metadata, _ = (
        psp_utils.read_gct_and_config_file(
            args.in_gct_path, args.psp_config_path))

    # Both the extraction and the re-insertion of the provenance code use
    # the same metadata field and delimiter
    prov_field = config_metadata["prov_code_field"]
    prov_delimiter = config_metadata["prov_code_delimiter"]

    initial_prov_code = psp_utils.extract_prov_code(
        in_gct.col_metadata_df, prov_field, prov_delimiter)

    # Normalize; median_normalize also hands back a provenance code, which
    # is the one written to the output
    out_gct, prov_code = median_normalize(
        in_gct,
        args.divide_by_mad,
        args.ignore_subset_norm,
        config_metadata,
        initial_prov_code)

    # Decide where the output goes
    out_gct_name = configure_out_name(args.in_gct_path, args.out_name)

    # Store the provenance code returned by median_normalize
    out_gct.col_metadata_df = insert_prov_code(
        out_gct.col_metadata_df, prov_code, prov_delimiter, prov_field)

    # Persist the normalized GCT
    write_output_gct(
        out_gct,
        out_gct_name,
        config_io["data_null"],
        config_io["filler_null"])

    return out_gct
def test_extract_prov_code(self):
    """extract_prov_code should return the "+"-separated pieces of the
    provenance string stored in the "prov_field" column."""
    # Every sample carries the same provenance string
    metadata = {
        "foo": ["a", "b", "c"],
        "prov_field": ["PRM+L2X", "PRM+L2X", "PRM+L2X"],
    }
    col_meta_df = pd.DataFrame.from_dict(metadata)

    expected = ["PRM", "L2X"]
    actual = utils.extract_prov_code(col_meta_df, "prov_field", "+")

    failure_msg = ("prov_code is incorrect: {}").format(actual)
    self.assertEqual(expected, actual, failure_msg)
def read_dry_gct_and_config_file(in_gct_path, config_path, forced_assay_type):
    """Load a GCT and its config file, and work out the assay type.

    Delegates the reading to psp_utils.read_gct_and_config_file and the
    provenance-code extraction to psp_utils.extract_prov_code. The assay
    type is forced_assay_type when that is not None; otherwise it is the
    first entry of the provenance code. The chosen assay type is then
    passed through check_assay_type together with the allowed P100 and GCP
    assay types from the config file.

    NOTE(review): the previous docstring stated that the provenance code
    must be non-empty and identical for all samples; that check is not
    visible here and is presumably enforced by extract_prov_code - confirm.

    Args:
        in_gct_path (string): filepath to gct file
        config_path (string): filepath to config file
        forced_assay_type (string, or None): overrides the assay type taken
            from the provenance code when not None

    Returns:
        gct (GCToo object)
        assay_type_out (string): value returned by check_assay_type
        prov_code (list of strings)
        config_io (dictionary)
        config_metadata (dictionary)
        config_parameters (dictionary)
    """
    # Load data and configuration in one go
    config_bundle = psp_utils.read_gct_and_config_file(in_gct_path, config_path)
    (gct, config_io, config_metadata, config_parameters) = config_bundle

    # Provenance code of the plate, taken from the column metadata
    prov_code = psp_utils.extract_prov_code(
        gct.col_metadata_df,
        config_metadata["prov_code_field"],
        config_metadata["prov_code_delimiter"])

    # An explicitly forced assay type wins over the provenance code
    assay_type = (
        prov_code[0] if forced_assay_type is None else forced_assay_type)

    # NOTE(review): eval() executes whatever expression the config file
    # contains for these keys. If the config file can come from an untrusted
    # source, consider ast.literal_eval instead - confirm the config values
    # are plain literals before switching.
    allowed_p100 = eval(config_metadata["p100_assays"])
    allowed_gcp = eval(config_metadata["gcp_assays"])

    # Validate the assay type against the allowed values
    assay_type_out = check_assay_type(assay_type, allowed_p100, allowed_gcp)

    return (gct, assay_type_out, prov_code,
            config_io, config_metadata, config_parameters)
def main(args):
    """Compute per-sample QC metrics from a GCT file and write a pw file.

    The data are un-logged if needed (as decided by
    undo_log_transform_if_needed from the provenance code), each row is
    divided by its own maximum, and the per-sample median and mean absolute
    deviation of the scaled values are written as a tab-delimited file
    alongside the plate and well names.

    Args:
        args: parsed command-line arguments; this function reads
            gct_file_path, plate_field, well_field and out_pw_file_path
            from it

    Returns:
        None. The output is written to args.out_pw_file_path.
    """
    # Import gct
    gct = parse(args.gct_file_path)

    # Get plate and well names
    (plate_names, well_names) = extract_plate_and_well_names(
        gct.col_metadata_df, args.plate_field, args.well_field)

    # Extract provenance code
    prov_code = utils.extract_prov_code(
        gct.col_metadata_df, PROV_CODE_FIELD, PROV_CODE_DELIMITER)

    # If data has been log-transformed, undo it
    unlogged_df = undo_log_transform_if_needed(gct.data_df, prov_code)

    # Divide by the maximum value for the row
    max_row_values = unlogged_df.max(axis="columns")
    divided_df = unlogged_df.div(max_row_values, axis="rows")

    # Per-sample (column-wise) median
    medium_over_heavy_medians = divided_df.median(axis=0).values

    # Per-sample mean absolute deviation around the mean. This is what
    # DataFrame.mad computed; it is spelled out here because DataFrame.mad
    # was deprecated in pandas 1.5 and removed in pandas 2.0.
    # Assumes every column of the data is numeric - TODO confirm.
    sample_means = divided_df.mean(axis=0)
    medium_over_heavy_mads = (
        divided_df.sub(sample_means, axis="columns")
        .abs()
        .mean(axis=0)
        .values)

    # Assemble plate_names, well_names, and metrics into a dataframe
    out_df = assemble_output_df(
        plate_names, well_names, {
            "medium_over_heavy_median": medium_over_heavy_medians,
            "medium_over_heavy_mad": medium_over_heavy_mads,
        })

    # Write to pw file
    out_df.to_csv(args.out_pw_file_path, sep="\t", na_rep="NaN", index=False)
    logger.info("PW file written to {}".format(args.out_pw_file_path))