df = defaultdict(lambda: defaultdict(dict))
    data_origin = args.data_origin

    for data_origin in BASE_ORIGINS:
        dh = DataHandler(data_origin)
        feature_names = dh.load_feature_names()
        train_data, test_data, val_data = dh.load_data_splits()
        y_name = dh.load_target_name()
        ood_mappings = dh.load_ood_mappings()
        rel_sizes = {}
        percentage_sigs = {}

        if data_origin == "MIMIC":

            train_ood, test_ood, val_ood = dh.load_newborns()
            all_ood = pd.concat([train_ood, test_ood, val_ood])

            df[data_origin]["Newborn"]["Count"] = len(all_ood)
            df[data_origin]["Newborn"]["Mortality rate"] = round(all_ood["y"].mean(), 3)

            rel_sizes["Newborn"] = len(all_ood) / (
                len(train_data) + len(test_data) + len(val_data)
            )
            percentage_sigs["Newborn"] = ood_utils.validate_ood_data(
                train_data[feature_names].values,
                all_ood[feature_names].values,
                verbose=False,
            )[1]

        for ood_name, (column_name, ood_value) in ood_mappings:
Ejemplo n.º 2
0
    parser.add_argument(
        "--data-origin",
        type=str,
        default="MIMIC_with_indicators",
        help="Which data to use",
    )
    args = parser.parse_args()

    # Loading the data
    dh = DataHandler(args.data_origin)
    feature_names = dh.load_feature_names()
    train_data, test_data, val_data = dh.load_data_splits()
    y_name = dh.load_target_name()

    if args.data_origin in MIMIC_ORIGINS:
        train_newborns, test_newborns, val_newborns = dh.load_newborns()
        all_newborns = pd.concat([train_newborns, val_newborns, test_newborns])

    ood_mappings = dh.load_ood_mappings()

    validation_results = pd.DataFrame(
        columns=["group", "raw", "imputed", "#feats ood", "#feats same"])
    validation_results.set_index("group")

    def _add_group_results(name: str, results: pd.DataFrame,
                           group_results: Dict[str, float]) -> pd.DataFrame:
        """Return ``results`` extended by one row describing a single group.

        Args:
            name: Label of the group. Names longer than 32 characters are cut
                to their first 32 characters followed by ``"..."``.
            results: Table collected so far. It is not modified; a new frame
                is returned.
            group_results: Column name -> value for the new row. The
                (possibly shortened) label is stored in it under ``"group"``,
                so the caller's dict is updated in place.

        Returns:
            A new DataFrame with a fresh 0..n-1 index whose last row holds
            ``group_results``. Columns of ``results`` come first, followed by
            any keys of ``group_results`` that ``results`` did not have yet.
        """
        # Only add the ellipsis when characters are really dropped: a name of
        # exactly 32 characters is kept as is.
        group_results["group"] = name if len(name) <= 32 else name[:32] + "..."
        new_row = pd.DataFrame([group_results])

        if len(results) == 0:
            # Nothing to stack onto yet. Build the one-row table directly so
            # the column dtypes come from the new row alone, keeping the
            # declared column order of ``results``.
            extra = [col for col in new_row.columns if col not in results.columns]
            return new_row.reindex(columns=[*results.columns, *extra])

        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported way to add rows.
        return pd.concat([results, new_row], ignore_index=True)