Example 1
    def test_get_paths_overlap(
        self,
        default_arguments: argparse.Namespace,
        train_valid_test_csv: DATA_SPLITS,
    ):
        args = default_arguments
        (
            (train_csv, train_ids),
            (valid_csv, valid_ids),
            (test_csv, test_ids),
        ) = train_valid_test_csv
        with pytest.raises(
            ValueError,
            match=(
                r"(train|validation|test) and (train|validation|test) samples overlap"
            ),
        ):
            train, valid, test = get_train_valid_test_ids(
                tensors=args.tensors,
                valid_ratio=args.valid_ratio,
                test_ratio=args.test_ratio,
                patient_csv=None,
                train_csv=train_csv,
                valid_csv=valid_csv,
                test_csv=test_csv,
            )
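A note on the `match` argument used above: `pytest.raises(match=...)` applies `re.search` to the string form of the raised exception, so the pattern only needs to match part of the error message. A minimal standalone sketch of this behavior, with a made-up error message mirroring the one this test expects:

import re

import pytest


def _raise_overlap():
    # Hypothetical error message shaped like the one the test above expects.
    raise ValueError("train and validation samples overlap")


def test_match_is_a_regex_search():
    # The pattern is searched, not full-matched, against str(exception).
    with pytest.raises(ValueError, match=r"(train|validation) and .* samples overlap"):
        _raise_overlap()
    assert re.search(r"samples overlap", "train and validation samples overlap")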
Example 2
def _tensors_to_df_with_dataset(
    tensors: Union[str, List[Union[str, Tuple[str, str]]]],
    patient_csv: str,
    tensor_maps_in: List[TensorMap],
    num_workers: int,
    batch_size: int,
    mrn_column_name: Optional[str] = None,
) -> pd.DataFrame:
    patient_ids, _, _ = get_train_valid_test_ids(
        tensors=tensors,
        mrn_column_name=mrn_column_name,
        patient_csv=patient_csv,
        valid_ratio=0,
        test_ratio=0,
        allow_empty_split=True,
    )
    hd5_sources, csv_sources = tensors_to_sources(tensors, tensor_maps_in)
    dataset, _, cleanup = make_dataset(
        data_split="explore",
        hd5_sources=hd5_sources,
        csv_sources=csv_sources,
        patient_ids=patient_ids,
        input_tmaps=tensor_maps_in,
        output_tmaps=[],
        batch_size=batch_size,
        num_workers=num_workers,
        cache=False,
        augment=False,
        validate=True,
        normalize=True,
        keep_ids=True,
        verbose=False,
        return_nan=True,
    )

    data, _, patient_ids = get_dicts_of_arrays_from_dataset(dataset)
    logging.info(
        f"Extracted {len(data[tensor_maps_in[0].input_name])} tensors")
    cleanup()

    df = pd.DataFrame()
    df["patientid"] = patient_ids
    for tm in tensor_maps_in:
        tensor = data[tm.input_name]
        if tm.is_language:
            tensor = tensor.astype(str)
        if tm.channel_map is not None:
            for cm, idx in tm.channel_map.items():
                df[f"{tm.name}_{cm}"] = tensor[:, idx]
        else:
            df[tm.name] = tensor[:, 0]
    logging.info("Reorganized tensors into dataframe")

    # The dataset returns a given patient's tensors in order, but tensors from
    # different patients may be interleaved. A stable sort on patientid groups each
    # patient's tensors together while preserving the order in which they were returned.
    df = df.sort_values("patientid", kind="mergesort")
    logging.info("Sorted tensors by patient ID")

    return df
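The stable sort at the end of `_tensors_to_df_with_dataset` relies on the fact that pandas' mergesort keeps rows with equal keys in their original relative order, which is what preserves each patient's tensor order. A minimal standalone illustration with made-up data:

import pandas as pd

# Interleaved rows for two patients; the "order" column records arrival order.
df = pd.DataFrame(
    {
        "patientid": [2, 1, 2, 1],
        "order": [0, 1, 2, 3],
    },
)

# kind="mergesort" is a stable sort, so within each patientid the original
# ("order") sequence is preserved after sorting.
sorted_df = df.sort_values("patientid", kind="mergesort")
assert sorted_df["order"].tolist() == [1, 3, 0, 2]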
Example 3
def train_valid_test_ids(
    default_arguments: argparse.Namespace,
    train_valid_test_csv: DATA_SPLITS,
) -> ID_SPLITS:
    args = default_arguments
    (
        (train_csv, train_ids),
        (valid_csv, valid_ids),
        (test_csv, test_ids),
    ) = train_valid_test_csv
    return get_train_valid_test_ids(
        tensors=args.tensors,
        valid_ratio=args.valid_ratio,
        test_ratio=args.test_ratio,
        patient_csv=None,
        train_csv=train_csv,
        valid_csv=valid_csv,
        test_csv=test_csv,
    )
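For readability of these examples, the split aliases presumably look roughly like the following; this is a sketch inferred from how the values are unpacked and consumed here and in the next example, not the repository's actual definitions:

from typing import Set, Tuple

# Assumed shapes: a (csv_path, ids) pair per split, three such pairs grouped together,
# and three sets of patient IDs (train, valid, test) returned by get_train_valid_test_ids.
DATA_SPLIT = Tuple[str, Set[int]]
DATA_SPLITS = Tuple[DATA_SPLIT, DATA_SPLIT, DATA_SPLIT]
ID_SPLITS = Tuple[Set[int], Set[int], Set[int]]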
Example 4
    def test_get_ids(
        self,
        default_arguments: argparse.Namespace,
        sample_set: Optional[DATA_SPLIT],
        train_set: Optional[DATA_SPLIT],
        valid_set: Optional[DATA_SPLIT],
        test_set: Optional[DATA_SPLIT],
    ):
        args = default_arguments

        def _ids_equal_samples(all_ids: Set[int], samples: Set[int]):
            assert len(all_ids) == len(samples)
            assert len(all_ids - samples) == 0
            return True

        patient_csv, patient_ids = sample_set or (None, None)
        train_csv, train_ids = train_set or (None, None)
        valid_csv, valid_ids = valid_set or (None, None)
        test_csv, test_ids = test_set or (None, None)
        train, valid, test = get_train_valid_test_ids(
            tensors=args.tensors,
            patient_csv=patient_csv,
            valid_ratio=args.valid_ratio,
            test_ratio=args.test_ratio,
            train_csv=train_csv,
            valid_csv=valid_csv,
            test_csv=test_csv,
        )

        # make sure the three splits are disjoint: no patient ID appears in more than one
        all_ids = train | valid | test
        counts = defaultdict(int)
        for split in (train, valid, test):
            for patient_id in split:
                counts[patient_id] += 1
        assert all(count == 1 for count in counts.values())

        # if no patient csv was given, discover tensor files on disk, just as the tensor generator does
        if patient_ids is None:
            patient_ids = set()
            for root, dirs, files in os.walk(default_arguments.tensors):
                for name in files:
                    if os.path.splitext(name)[-1].lower() != TENSOR_EXT:
                        continue
                    patient_ids.add(int(os.path.splitext(name)[0]))

        if train_ids is not None:
            # when train_csv is supplied, samples not listed in it are discarded, so the
            # combined splits may be a proper subset of the tensors on disk
            assert len(all_ids) <= len(patient_ids)
            assert all(patient_id in patient_ids for patient_id in all_ids)
        else:
            assert _ids_equal_samples(all_ids, patient_ids)

        if train_ids is not None:
            train_ids &= patient_ids
            assert _ids_equal_samples(train, train_ids)

        if valid_ids is not None:
            valid_ids &= patient_ids
            assert _ids_equal_samples(valid, valid_ids)

        if test_ids is not None:
            test_ids &= patient_ids
            assert _ids_equal_samples(test, test_ids)
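The disjointness property verified above can also be stated compactly with set arithmetic: for pairwise-disjoint splits, the size of the union equals the sum of the sizes. A standalone illustration with made-up IDs:

# Three hypothetical ID splits with no shared members.
train = {1, 2, 3}
valid = {4, 5}
test = {6}

# If the splits are pairwise disjoint, the union is as large as the splits combined.
assert len(train | valid | test) == len(train) + len(valid) + len(test)

# An overlapping split breaks the equality.
leaky_valid = {3, 4, 5}
assert len(train | leaky_valid | test) < len(train) + len(leaky_valid) + len(test)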
Example 5
def _tensors_to_df(
    tensor_maps_in: List[TensorMap],
    tensors: str,
    num_workers: int,
    patient_csv: Optional[str] = None,
    mrn_column_name: Optional[str] = None,
    valid_ratio: Optional[float] = None,
    test_ratio: Optional[float] = None,
    train_csv: Optional[str] = None,
    valid_csv: Optional[str] = None,
    test_csv: Optional[str] = None,
    output_folder: str = "",
    export_error: bool = False,
    export_fpath: bool = False,
    export_generator: bool = False,
) -> pd.DataFrame:
    """
    Create generators, load TMaps, call run method of class that parses tensors from
    HD5 files using TMaps and saves temporary CSVs, set dtypes, consolidate CSVs into
    single dataframe, and return dataframe.
    """
    logging.info("Determining train/valid/test splits for the specified tensors")
    train_ids, valid_ids, test_ids = get_train_valid_test_ids(
        tensors=tensors,
        patient_csv=patient_csv,
        mrn_column_name=mrn_column_name,
        valid_ratio=valid_ratio,
        test_ratio=test_ratio,
        train_csv=train_csv,
        valid_csv=valid_csv,
        test_csv=test_csv,
        allow_empty_split=True,
    )

    train_paths = [
        os.path.join(tensors, f"{patient_id}.hd5") for patient_id in train_ids
    ]
    valid_paths = [
        os.path.join(tensors, f"{patient_id}.hd5") for patient_id in valid_ids
    ]
    test_paths = [
        os.path.join(tensors, f"{patient_id}.hd5") for patient_id in test_ids
    ]
    paths: List[Tuple[str, str]] = []
    paths.extend(zip(train_paths, ["train"] * len(train_paths)))
    paths.extend(zip(valid_paths, ["valid"] * len(valid_paths)))
    paths.extend(zip(test_paths, ["test"] * len(test_paths)))
    tmaps = tensor_maps_in

    TensorsToDataFrameParallelWrapper(
        tmaps=tmaps,
        paths=paths,
        num_workers=num_workers,
        output_folder=output_folder,
        export_error=export_error,
        export_fpath=export_fpath,
        export_generator=export_generator,
    ).run()

    # Get columns that should have dtype 'string' instead of dtype 'O'
    str_cols_list: List[str] = []
    if export_fpath:
        str_cols_list.append("fpath")
    if export_generator:
        str_cols_list.append("generator")
    for tm in tmaps:
        if tm.interpretation == Interpretation.LANGUAGE:
            str_cols_list.extend(
                [f"{tm.name}_{cm}" for cm in tm.channel_map]
                if tm.channel_map
                else [tm.name],
            )
        str_cols_list.append(f"error_type_{tm.name}")
    str_cols = {key: "string" for key in str_cols_list}

    # Consolidate temporary CSV files into one dataframe
    temp_files = []
    df_list = []
    for name in os.listdir(output_folder):
        if "tensors_all_union_" in name:
            fpath = os.path.join(output_folder, name)
            _df = pd.read_csv(fpath, dtype=str_cols)
            logging.debug(f"Loaded {fpath} into memory")
            df_list.append(_df)
            logging.debug(f"Appended {fpath} to list of dataframes")
            temp_files.append(fpath)
    df = pd.concat(df_list, ignore_index=True)

    logging.info(
        f"{len(df)} samples extracted from {len(paths)} hd5 files using {len(tmaps)}"
        " tmaps, and consolidated to one DataFrame",
    )

    # Delete temporary files
    for fpath in temp_files:
        os.remove(fpath)
    logging.debug(f"Deleted {len(temp_files)} temporary files")

    return df
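The `str_cols` mapping built above forces the selected columns to pandas' dedicated string dtype instead of generic `object` when the temporary CSVs are read back. A minimal standalone sketch of that effect, using made-up file contents:

import io

import pandas as pd

csv_text = "patientid,note,error_type_note\n1,hello,\n2,world,ValueError\n"

# Without a dtype mapping, text columns come back as generic object dtype.
df_obj = pd.read_csv(io.StringIO(csv_text))
assert df_obj["note"].dtype == object

# With dtype="string", the same columns use pandas' StringDtype instead.
df_str = pd.read_csv(
    io.StringIO(csv_text),
    dtype={"note": "string", "error_type_note": "string"},
)
assert str(df_str["note"].dtype) == "string"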