def _prevalent_incident_tensor_from_file(
    tm: TensorMap,
    hd5: h5py.File,
    dependents=None,
):
    # Channel 0 is "no disease"; channels 1 and 2 are prevalent and incident disease.
    index = 0
    categorical_data = np.zeros(tm.shape, dtype=np.float32)
    if tm.hd5_key_guess() in hd5:
        data = tm.hd5_first_dataset_in_group(hd5, tm.hd5_key_guess())
        if tm.storage_type in (StorageType.CATEGORICAL_INDEX, StorageType.CATEGORICAL_FLAG):
            index = int(data[0])
            categorical_data[index] = 1.0
        else:
            categorical_data = np.array(data)
    elif tm.storage_type == StorageType.CATEGORICAL_FLAG:
        categorical_data[index] = 1.0
    else:
        raise ValueError(
            f"No HD5 Key at prefix {tm.path_prefix} found for tensor map: {tm.name}.",
        )

    if index != 0:
        # start_date_key and event_date_key are free variables supplied by the
        # enclosing factory (see the sketch below).
        if event_date_key in hd5 and start_date_key in hd5:
            disease_date = str2date(str(hd5[event_date_key][0]))
            assess_date = str2date(str(hd5[start_date_key][0]))
        else:
            raise ValueError(f"No date found for tensor map: {tm.name}.")
        # Disease before assessment is prevalent (channel 1), otherwise incident (channel 2).
        index = 1 if disease_date < assess_date else 2
        categorical_data[index] = 1.0
    return categorical_data
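# A minimal sketch, assuming the closure pattern implied above: a factory
# (the name `_prevalent_incident_tensor` is assumed) binds `start_date_key` and
# `event_date_key`, which the tensor_from_file callback reads as free variables.
def _prevalent_incident_tensor(start_date_key: str, event_date_key: str):
    def _tensor_from_file(tm, hd5, dependents=None):
        ...  # body identical to _prevalent_incident_tensor_from_file above
    return _tensor_from_file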
def _handle_tm(self, tm: TensorMap, is_input: bool, path: Path) -> h5py.File:
    name = tm.input_name() if is_input else tm.output_name()
    batch = self.in_batch if is_input else self.out_batch
    idx = self.stats['batch_index']

    # Dependent tensors were already computed by another TensorMap's tensor_from_file.
    if tm in self.dependents:
        batch[name][idx] = self.dependents[tm]
        if tm.cacheable:
            self.cache[path, name] = self.dependents[tm]
        self._collect_stats(tm, self.dependents[tm])
        return self.hd5
    if (path, name) in self.cache:
        batch[name][idx] = self.cache[path, name]
        return self.hd5
    if self.hd5 is None:  # Don't open the hd5 if everything is in self.cache
        self.hd5 = h5py.File(path, 'r')

    tensor = tm.postprocess_tensor(
        tm.tensor_from_file(tm, self.hd5, self.dependents),
        augment=self.augment, hd5=self.hd5,
    )
    # Crop the tensor to the TensorMap's static shape along every axis.
    slices = tuple(
        slice(min(tm.static_shape()[i], tensor.shape[i]))
        for i in range(len(tensor.shape))
    )
    batch[name][(idx,) + slices] = tensor[slices]
    if tm.cacheable:
        self.cache[path, name] = batch[name][idx]
    self._collect_stats(tm, tensor)
    return self.hd5
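# Hedged usage sketch of how a batch worker might drive _handle_tm for one
# sample; the worker attributes and TensorMap lists are illustrative, not the
# actual ml4h generator loop.
def _load_one_sample(worker, path):
    worker.dependents = {}
    worker.hd5 = None
    try:
        for tm in worker.input_maps:
            worker.hd5 = worker._handle_tm(tm, is_input=True, path=path)
        for tm in worker.output_maps:
            worker.hd5 = worker._handle_tm(tm, is_input=False, path=path)
        worker.stats['batch_index'] += 1
    finally:
        if worker.hd5 is not None:
            worker.hd5.close()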
def make_test_tensor_maps(desired_map_name: str) -> Optional[TensorMap]:
    for n in range(1, 6):
        if desired_map_name == f'{n}d_cont':
            return TensorMap(
                f'{n}d_cont', shape=tuple(range(2, n + 2)),
                interpretation=Interpretation.CONTINUOUS,
            )
        if desired_map_name == f'{n}d_cat':
            return TensorMap(
                f'{n}d_cat',
                shape=tuple(range(2, n + 2)),
                interpretation=Interpretation.CATEGORICAL,
                channel_map={f'c_{i}': i for i in range(n + 1)},
            )
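# Illustrative lookups (assumed usage): shapes grow with dimension and
# categorical maps get n + 1 channels.
tm_cont = make_test_tensor_maps('3d_cont')
assert tm_cont.shape == (2, 3, 4)
tm_cat = make_test_tensor_maps('2d_cat')
assert tm_cat.channel_map == {'c_0': 0, 'c_1': 1, 'c_2': 2}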
def build_cardiac_surgery_tensor_maps(
    needed_name: str,
) -> TensorMap:
    outcome2column = {
        "sts_death": "mtopd",
        "sts_stroke": "cnstrokp",
        "sts_renal_failure": "crenfail",
        "sts_prolonged_ventilation": "cpvntlng",
        "sts_dsw_infection": "deepsterninf",
        "sts_reoperation": "reop",
        "sts_any_morbidity": "anymorbidity",
        "sts_long_stay": "llos",
    }

    cardiac_surgery_dict = None
    date_interval_lookup = None
    if needed_name in outcome2column:
        if cardiac_surgery_dict is None:
            cardiac_surgery_dict = build_cardiac_surgery_dict(
                additional_columns=[outcome2column[needed_name]],
            )
        channel_map = _outcome_channels(needed_name)
        sts_tmap = TensorMap(
            needed_name,
            Interpretation.CATEGORICAL,
            path_prefix=PARTNERS_PREFIX,
            tensor_from_file=make_cardiac_surgery_outcome_tensor_from_file(
                cardiac_surgery_dict, outcome2column[needed_name],
            ),
            channel_map=channel_map,
            validator=validator_not_all_zero,
        )
    elif needed_name.endswith('_sts'):
        # Wrap an existing time-series TensorMap and restrict it to the surgical date interval.
        base_name = needed_name.split('_sts')[0]
        tmap_map = build_partners_time_series_tensor_maps([base_name])
        if cardiac_surgery_dict is None:
            # needed_name is not an outcome here, so no additional columns are requested.
            cardiac_surgery_dict = build_cardiac_surgery_dict()
        if date_interval_lookup is None:
            date_interval_lookup = build_date_interval_lookup(cardiac_surgery_dict)
        sts_tmap = copy.deepcopy(tmap_map[base_name])
        sts_tmap.name = needed_name
        sts_tmap.time_series_lookup = date_interval_lookup
    else:
        raise ValueError(f'Unknown cardiac surgery tensor map: {needed_name}')

    return sts_tmap
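# Illustrative lookups (assumed usage): the first branch builds an STS outcome
# classifier, the second wraps an existing ECG map and restricts it to the
# surgical date interval via the '_sts' suffix.
death_tmap = build_cardiac_surgery_tensor_maps('sts_death')
ecg_preop_tmap = build_cardiac_surgery_tensor_maps('partners_ecg_2500_std_sts')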
def make_mgb_ecg_measurement_matrix_global_tensor_maps(needed_name: str):
    # Measurement matrix TMAPS -- indices from the MUSE XML dev manual, page 49 and following
    measurement_matrix_global_measures = {
        'pon': 1,      # P-wave onset in median beat (in samples)
        'poff': 2,     # P-wave offset in median beat
        'qon': 3,      # Q-Onset in median beat
        'qoff': 4,     # Q-Offset in median beat
        'ton': 5,      # T-Onset in median beat
        'toff': 6,     # T-Offset in median beat
        'nqrs': 7,     # Number of QRS Complexes
        'qrsdur': 8,   # QRS Duration
        'qt': 9,       # QT Interval
        'qtc': 10,     # QT Corrected
        'print': 11,   # PR Interval
        'vrate': 12,   # Ventricular Rate
        'avgrr': 13,   # Average R-R Interval
    }
    for measure, measure_idx in measurement_matrix_global_measures.items():
        if f'partners_ecg_measurement_matrix_{measure}' == needed_name:
            return TensorMap(
                f'partners_ecg_measurement_matrix_{measure}',
                interpretation=Interpretation.CONTINUOUS,
                shape=(None, 1),
                path_prefix=PARTNERS_PREFIX,
                loss='logcosh',
                time_series_limit=0,
                tensor_from_file=make_measurement_matrix_from_file(measure_idx),
            )
def make_waveform_maps(desired_map_name: str) -> TensorMap:
    """Creates 12 possible Tensor Maps and returns the desired one or None:

        partners_ecg_2500      partners_ecg_2500_exact      partners_ecg_5000      partners_ecg_5000_exact
        partners_ecg_2500_std  partners_ecg_2500_std_exact  partners_ecg_5000_std  partners_ecg_5000_std_exact
        partners_ecg_2500_raw  partners_ecg_2500_raw_exact  partners_ecg_5000_raw  partners_ecg_5000_raw_exact

    default normalizes with ZeroMeanStd1 and resamples
    _std normalizes with Standardize mean = 0, std = 2000
    _raw does not normalize
    _exact does not resample

    :param desired_map_name: The name of the TensorMap to build
    :return: The desired TensorMap
    """
    length_options = [2500, 5000]
    exact_options = [True, False]
    normalize_options = [ZeroMeanStd1(), Standardize(mean=0, std=2000), None]
    for length, exact_length, normalization in product(length_options, exact_options, normalize_options):
        norm = '' if isinstance(normalization, ZeroMeanStd1) else '_std' if isinstance(normalization, Standardize) else '_raw'
        exact = '_exact' if exact_length else ''
        name = f'partners_ecg_{length}{norm}{exact}'
        if name == desired_map_name:
            return TensorMap(
                name,
                shape=(None, length, 12),
                path_prefix=PARTNERS_PREFIX,
                tensor_from_file=make_voltage(exact_length),
                normalization=normalization,
                channel_map=ECG_REST_AMP_LEADS,
                time_series_limit=0,
                validator=validator_not_all_zero,
            )
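# Hedged sketch: enumerate all 12 names this factory recognizes, using the same
# naming grammar as the loop above.
from itertools import product
for length, exact, norm in product([2500, 5000], [True, False], ['', '_std', '_raw']):
    print(f"partners_ecg_{length}{norm}{'_exact' if exact else ''}")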
def test_explore(self, default_arguments, tmpdir_factory):
    temp_dir = tmpdir_factory.mktemp('explore_tensors')
    default_arguments.tensors = str(temp_dir)
    tmaps = TMAPS_UP_TO_4D[:]
    tmaps.append(
        TensorMap('scalar', shape=(1,), interpretation=Interpretation.CONTINUOUS),
    )
    explore_expected = build_hdf5s(temp_dir, tmaps, n=pytest.N_TENSORS)
    default_arguments.num_workers = 3
    default_arguments.tensor_maps_in = tmaps
    explore(default_arguments)

    csv_path = os.path.join(
        default_arguments.output_folder, default_arguments.id, 'tensors_all_union.csv',
    )
    explore_result = pd.read_csv(csv_path)
    for _, row in explore_result.iterrows():
        for tm in tmaps:
            row_expected = explore_expected[(row['fpath'], tm)]
            if _should_error_detect(tm):
                actual = getattr(row, _continuous_explore_header(tm))
                assert not np.isnan(actual)
                continue
            if tm.is_continuous():
                actual = getattr(row, _continuous_explore_header(tm))
                assert actual == row_expected
                continue
            if tm.is_categorical():
                for channel, idx in tm.channel_map.items():
                    channel_val = getattr(row, _categorical_explore_header(tm, channel))
                    assert channel_val == row_expected[idx]
def make_lead_maps(desired_map_name: str) -> TensorMap:
    for lead in ECG_REST_AMP_LEADS:
        tensormap_name = f'lead_{lead}_len'
        if desired_map_name == tensormap_name:
            return TensorMap(
                tensormap_name,
                interpretation=Interpretation.CATEGORICAL,
                path_prefix=PARTNERS_PREFIX,
                channel_map={'_2500': 0, '_5000': 1, 'other': 2},
                time_series_limit=0,
                validator=validator_not_all_zero,
                tensor_from_file=make_voltage_len_categorical_tmap(lead=lead),
            )
def build_tensor_maps(
    data_descriptions: List[DataDescription],
) -> List[TensorMap]:
    tmaps = []
    for name, shape, storage_type in data_descriptions:
        tmaps.append(
            TensorMap(
                name,
                interpretation=STORAGE_TYPE_TO_INTERPRETATION[storage_type],
                shape=shape,
            ),
        )
    return tmaps
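# Hedged usage sketch: DataDescription entries are assumed to unpack as
# (name, shape, storage_type) tuples, matching the loop above.
example_tmaps = build_tensor_maps([
    ('bmi', (1,), StorageType.CONTINUOUS),
    ('sex', (2,), StorageType.CATEGORICAL_INDEX),
])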
def tensor_from_file(tm: TensorMap, hd5: h5py.File, dependents=None):
    # error, normalization, mean, std, table, and file_name are free variables
    # bound by the enclosing factory that builds this callback.
    if error:
        raise error
    if normalization:
        tm.normalization = {'mean': mean, 'std': std}
    try:
        return table[os.path.basename(hd5.filename).replace('.hd5', '')].copy()
    except KeyError:
        raise KeyError(f'User id not in file {file_name}.')
def make_mgb_ecg_measurement_matrix_lead_tensor_maps(needed_name: str):
    for lead, lead_idx in measurement_matrix_leads.items():
        for measure, measure_idx in measurement_matrix_lead_measures.items():
            if f'partners_ecg_measurement_matrix_{lead}_{measure}' == needed_name:
                return TensorMap(
                    f'partners_ecg_measurement_matrix_{lead}_{measure}',
                    interpretation=Interpretation.CONTINUOUS,
                    shape=(None, 1),
                    path_prefix=PARTNERS_PREFIX,
                    loss='logcosh',
                    time_series_limit=0,
                    tensor_from_file=make_measurement_matrix_from_file(measure_idx, lead_idx=lead_idx),
                )
def __init__(
    self,
    tensor_map: TensorMap,
    activation: str,
    parents: List[TensorMap] = None,
    **kwargs,
):
    self.tensor_map = tensor_map
    if not self.can_apply():
        return
    self.parents = parents
    self.activation = _activation_layer(activation)
    self.dense = Dense(
        units=tensor_map.shape[0],
        name=tensor_map.output_name(),
        activation=tensor_map.activation,
    )
    self.units = tensor_map.annotation_units
def generate_continuous_tensor_map_from_file(
    file_name: str,
    column_name: str,
    tensor_map_name: str,
    normalization: bool,
    discretization_bounds: List[float],
) -> TensorMap:
    if discretization_bounds:
        return TensorMap(
            tensor_map_name,
            Interpretation.DISCRETIZED,
            channel_map={tensor_map_name: 0},
            tensor_from_file=build_tensor_from_file(file_name, column_name, normalization),
            discretization_bounds=discretization_bounds,
        )
    else:
        return TensorMap(
            tensor_map_name,
            channel_map={tensor_map_name: 0},
            tensor_from_file=build_tensor_from_file(file_name, column_name, normalization),
        )
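# Hedged usage sketch; file and column names are illustrative. An empty bounds
# list yields a plain continuous map, a non-empty list a DISCRETIZED one.
age_tmap = generate_continuous_tensor_map_from_file(
    'phenotypes.csv', 'age', 'age_csv', normalization=True, discretization_bounds=[],
)
age_binned_tmap = generate_continuous_tensor_map_from_file(
    'phenotypes.csv', 'age', 'age_binned_csv', normalization=False,
    discretization_bounds=[40, 55, 70],
)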
def make_mgb_ecg_lvh_tensormaps(needed_name: str):
    def ecg_lvh_from_file(tm: TensorMap, hd5: h5py.File, dependents={}):
        # Lead order seems constant and standard throughout, but we could eventually tensorize it from XML
        avl_min = 1100.0
        sl_min = 3500.0
        cornell_female_min = 2000.0
        cornell_male_min = 2800.0
        sleads = ['V1', 'V3']
        rleads = ['aVL', 'V5', 'V6']
        ecg_dates = _get_ecg_dates(tm, hd5)
        dynamic, shape = _is_dynamic_shape(tm, len(ecg_dates))
        tensor = np.zeros(shape, dtype=float)
        for i, ecg_date in enumerate(ecg_dates):
            path = _make_hd5_path(tm, ecg_date, 'measurementmatrix')
            matrix = decompress_data(data_compressed=hd5[path][()], dtype=hd5[path].attrs['dtype'])
            criteria_sleads = {
                lead: _get_measurement_matrix_entry(matrix, measurement_matrix_lead_measures['samp'], measurement_matrix_leads[lead])
                for lead in sleads
            }
            criteria_rleads = {
                lead: _get_measurement_matrix_entry(matrix, measurement_matrix_lead_measures['ramp'], measurement_matrix_leads[lead])
                for lead in rleads
            }
            sex_path = _make_hd5_path(tm, ecg_date, 'gender')
            is_female = 'female' in decompress_data(data_compressed=hd5[sex_path][()], dtype=hd5[sex_path].attrs['dtype'])
            if 'avl_lvh' in tm.name:
                is_lvh = criteria_rleads['aVL'] > avl_min
            elif 'sokolow_lyon_lvh' in tm.name:
                is_lvh = criteria_sleads['V1'] + np.maximum(criteria_rleads['V5'], criteria_rleads['V6']) > sl_min
            elif 'cornell_lvh' in tm.name:
                is_lvh = criteria_rleads['aVL'] + criteria_sleads['V3']
                if is_female:
                    is_lvh = is_lvh > cornell_female_min
                else:
                    is_lvh = is_lvh > cornell_male_min
            else:
                raise ValueError(f'{tm.name} criterion for LVH is not accounted for')
            # Following convention from categorical TMAPS, positive has cmap index 1
            index = 1 if is_lvh else 0
            slices = (i, index) if dynamic else (index,)
            tensor[slices] = 1.0
        return tensor

    for criterion in ['avl_lvh', 'sokolow_lyon_lvh', 'cornell_lvh']:
        if f'partners_ecg_{criterion}' == needed_name:
            return TensorMap(
                f'partners_ecg_{criterion}',
                interpretation=Interpretation.CATEGORICAL,
                path_prefix=PARTNERS_PREFIX,
                tensor_from_file=ecg_lvh_from_file,
                channel_map={f'no_{criterion}': 0, criterion: 1},
                shape=(None, 2),
                time_series_limit=0,
            )
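# Worked example of the Sokolow-Lyon branch above, with illustrative amplitudes
# in the same units as the measurement matrix:
#   S(V1) = 1600, R(V5) = 2100, R(V6) = 1900
#   1600 + max(2100, 1900) = 3700 > sl_min (3500), so is_lvh would be True.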
def generate_random_text_tensor_maps(
    text_file: str,
    window_size: int,
    one_hot: bool = True,
) -> Tuple[TensorMap, TensorMap, TensorMap]:
    name = os.path.basename(text_file).split('.')[0]
    text, token_dictionary = token_dictionary_and_text_from_file(text_file)
    shape = (window_size, len(token_dictionary)) if one_hot else (window_size,)
    burn_in = TensorMap(
        f'next_{name}',
        Interpretation.LANGUAGE,
        shape=shape,
        channel_map=token_dictionary,
        cacheable=False,
    )
    output_map = TensorMap(
        f'next_next_{name}',
        Interpretation.LANGUAGE,
        shape=(len(token_dictionary),) if one_hot else shape,
        loss='categorical_crossentropy',
        channel_map=token_dictionary,
        cacheable=False,
    )
    input_map = TensorMap(
        name,
        Interpretation.LANGUAGE,
        shape=shape,
        tensor_from_file=random_text_window_tensor(text, window_size, one_hot=one_hot),
        dependent_map=[burn_in, output_map],
        channel_map=token_dictionary,
        annotation_units=128,
        cacheable=False,
    )
    return input_map, burn_in, output_map
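# Illustrative shapes (assumed toy corpus with a 26-token dictionary):
# with window_size=32 and one_hot=True the three maps have shapes
# (32, 26), (32, 26) and (26,); with one_hot=False they become
# (32,), (32,) and (32,).
inp, burn_in, out = generate_random_text_tensor_maps('corpus.txt', window_size=32)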
def __init__(
    self,
    *,
    tensor_map: TensorMap,
    dense_layers: List[int],
    activation: str,
    dense_normalize: str,
    dense_regularize: str,
    dense_regularize_rate: float,
    **kwargs,
):
    self.tensor_map = tensor_map
    if not self.can_apply():
        return
    self.fully_connected = DenseBlock(
        widths=dense_layers,
        activation=activation,
        normalization=dense_normalize,
        regularization=dense_regularize,
        regularization_rate=dense_regularize_rate,
        name=tensor_map.embed_name(),
    )
def make_wide_file_maps(desired_map_name: str) -> Union[TensorMap, None]:
    days_window = 1825

    if desired_map_name == 'sex_from_wide_csv':
        csv_tff = tensor_from_wide(WIDE_FILE, target='sex')
        return TensorMap(
            'sex_from_wide',
            Interpretation.CATEGORICAL,
            annotation_units=2,
            tensor_from_file=csv_tff,
            channel_map={'female': 0, 'male': 1},
        )
    elif desired_map_name == 'age_from_wide_csv':
        csv_tff = tensor_from_wide(WIDE_FILE, target='age')
        return TensorMap(
            'age_from_wide',
            Interpretation.CONTINUOUS,
            shape=(1,),
            tensor_from_file=csv_tff,
            channel_map={'age': 0},
            normalization={'mean': 63.35798891483556, 'std': 7.554638350423902},
        )
    elif desired_map_name == 'bmi_from_wide_csv':
        csv_tff = tensor_from_wide(WIDE_FILE, target='bmi')
        return TensorMap(
            'bmi_from_wide',
            Interpretation.CONTINUOUS,
            shape=(1,),
            channel_map={'bmi': 0},
            annotation_units=1,
            normalization={'mean': 27.3397, 'std': 4.77216},
            tensor_from_file=csv_tff,
        )
    elif desired_map_name == 'ecg_2500_from_wide_csv':
        tff = tensor_from_wide(WIDE_FILE, target='ecg')
        return TensorMap(
            'ecg_rest_raw',
            shape=(2500, 12),
            path_prefix=PARTNERS_PREFIX,
            tensor_from_file=tff,
            cacheable=False,
            channel_map=ECG_REST_UKB_LEADS,
        )
    elif desired_map_name == 'ecg_5000_from_wide_csv':
        tff = tensor_from_wide(WIDE_FILE, target='ecg')
        return TensorMap(
            'ecg_rest_raw',
            shape=(5000, 12),
            path_prefix=PARTNERS_PREFIX,
            tensor_from_file=tff,
            cacheable=False,
            channel_map=ECG_REST_UKB_LEADS,
        )
    elif desired_map_name == 'time_to_hf_wide_csv':
        tff = tensor_from_wide(WIDE_FILE, target='time_to_event')
        return TensorMap('time_to_hf', Interpretation.TIME_TO_EVENT, tensor_from_file=tff)
    elif desired_map_name == 'survival_curve_hf_wide_csv':
        tff = tensor_from_wide(WIDE_FILE, target='survival_curve')
        return TensorMap(
            'survival_curve_hf',
            Interpretation.SURVIVAL_CURVE,
            tensor_from_file=tff,
            shape=(50,),
            days_window=days_window,
        )
def make_partners_diagnosis_maps(desired_map_name: str) -> Union[TensorMap, None]:
    diagnosis2column = {
        'atrial_fibrillation': 'first_af',
        'blood_pressure_medication': 'first_bpmed',
        'coronary_artery_disease': 'first_cad',
        'cardiovascular_disease': 'first_cvd',
        'death': 'death_date',
        'diabetes_mellitus': 'first_dm',
        'heart_failure': 'first_hf',
        'hypertension': 'first_htn',
        'left_ventricular_hypertrophy': 'first_lvh',
        'myocardial_infarction': 'first_mi',
        'pulmonary_artery_disease': 'first_pad',
        'stroke': 'first_stroke',
        'valvular_disease': 'first_valvular_disease',
    }
    for diagnosis in diagnosis2column:
        # Build diagnosis classification TensorMaps
        name = f'diagnosis_{diagnosis}'
        if name == desired_map_name:
            tensor_from_file_fxn = build_incidence_tensor_from_file(
                INCIDENCE_CSV, diagnosis_column=diagnosis2column[diagnosis],
            )
            return TensorMap(
                f'{name}_newest', Interpretation.CATEGORICAL, path_prefix=PARTNERS_PREFIX,
                channel_map=_diagnosis_channels(diagnosis), tensor_from_file=tensor_from_file_fxn,
            )
        name = f'incident_diagnosis_{diagnosis}'
        if name == desired_map_name:
            tensor_from_file_fxn = build_incidence_tensor_from_file(
                INCIDENCE_CSV, diagnosis_column=diagnosis2column[diagnosis], incidence_only=True,
            )
            return TensorMap(
                f'{name}_newest', Interpretation.CATEGORICAL, path_prefix=PARTNERS_PREFIX,
                channel_map=_diagnosis_channels(diagnosis, incidence_only=True),
                tensor_from_file=tensor_from_file_fxn,
            )

        # Build time to event TensorMaps
        name = f'cox_{diagnosis}'
        if name == desired_map_name:
            tff = loyalty_time_to_event(INCIDENCE_CSV, diagnosis_column=diagnosis2column[diagnosis])
            return TensorMap(f'{name}_newest', Interpretation.TIME_TO_EVENT, path_prefix=PARTNERS_PREFIX, tensor_from_file=tff)
        name = f'incident_cox_{diagnosis}'
        if name == desired_map_name:
            tff = loyalty_time_to_event(
                INCIDENCE_CSV, diagnosis_column=diagnosis2column[diagnosis], incidence_only=True,
            )
            return TensorMap(f'{name}_newest', Interpretation.TIME_TO_EVENT, path_prefix=PARTNERS_PREFIX, tensor_from_file=tff)

        # Build survival curve TensorMaps
        for days_window in [1825]:
            name = f'survival_{diagnosis}_{days_window}'
            if name == desired_map_name:
                tff = _survival_from_file(days_window, INCIDENCE_CSV, diagnosis_column=diagnosis2column[diagnosis])
                return TensorMap(name, Interpretation.SURVIVAL_CURVE, path_prefix=PARTNERS_PREFIX, shape=(50,), days_window=days_window, tensor_from_file=tff)
            name = f'incident_survival_{diagnosis}'
            if name == desired_map_name:
                tff = _survival_from_file(
                    days_window, INCIDENCE_CSV, diagnosis_column=diagnosis2column[diagnosis], incidence_only=True,
                )
                return TensorMap(name, Interpretation.SURVIVAL_CURVE, path_prefix=PARTNERS_PREFIX, shape=(50,), days_window=days_window, tensor_from_file=tff)
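# Illustrative lookups against the name grammar above (assumed usage):
af_prevalent = make_partners_diagnosis_maps('diagnosis_atrial_fibrillation')
af_cox = make_partners_diagnosis_maps('cox_atrial_fibrillation')
af_survival = make_partners_diagnosis_maps('survival_atrial_fibrillation_1825')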
def __init__(
    self,
    *,
    tensor_map: TensorMap,
    dense_blocks: List[int],
    conv_type: str,
    conv_width: List[int],
    conv_x: List[int],
    conv_y: List[int],
    conv_z: List[int],
    block_size: int,
    activation: str,
    conv_normalize: str,
    conv_regularize: str,
    conv_regularize_rate: float,
    pool_x: int,
    pool_y: int,
    pool_z: int,
    u_connect_parents: List[TensorMap] = None,
    **kwargs,
):
    self.tensor_map = tensor_map
    if not self.can_apply():
        return
    dimension = tensor_map.axes()

    # Repeat the filter sizes so there is one entry per dense block.
    x_filters = _repeat_dimension(conv_width if dimension == 2 else conv_x, len(dense_blocks))
    y_filters = _repeat_dimension(conv_y, len(dense_blocks))
    z_filters = _repeat_dimension(conv_z, len(dense_blocks))
    self.dense_conv_blocks = [
        DenseConvolutional(
            dimension=tensor_map.axes(),
            conv_layer_type=conv_type,
            filters=filters,
            conv_x=[x] * block_size,
            conv_y=[y] * block_size,
            conv_z=[z] * block_size,
            block_size=block_size,
            activation=activation,
            normalization=conv_normalize,
            regularization=conv_regularize,
            regularization_rate=conv_regularize_rate,
        )
        for filters, x, y, z in zip(dense_blocks, x_filters, y_filters, z_filters)
    ]
    conv_layer, _ = _conv_layer_from_kind_and_dimension(dimension, 'conv', conv_x, conv_y, conv_z)
    self.conv_label = conv_layer(
        tensor_map.shape[-1],
        _one_by_n_kernel(dimension),
        activation=tensor_map.activation,
        name=tensor_map.output_name(),
    )
    self.upsamples = [_upsampler(dimension, pool_x, pool_y, pool_z) for _ in range(len(dense_blocks) + 1)]
    self.u_connect_parents = u_connect_parents or []
    self.start_shape = _start_shape_before_pooling(
        num_upsamples=len(dense_blocks),
        output_shape=tensor_map.shape,
        upsample_rates=[pool_x, pool_y, pool_z],
        channels=dense_blocks[-1],
    )
    self.reshape = FlatToStructure(
        output_shape=self.start_shape,
        activation=activation,
        normalization=conv_normalize,
    )
    logging.info(f'Built a decoder with {len(self.dense_conv_blocks)} dense convolutional blocks and reshape {self.start_shape}')
def normalized_first_date(tm: TensorMap, hd5: h5py.File, dependents=None):
    tensor = get_tensor_at_first_date(hd5, tm.path_prefix, tm.name)
    if tm.axes() > 1:
        return pad_or_crop_array_to_shape(tm.shape, tensor)
    else:
        return tensor
from typing import Dict

import h5py
import numpy as np

from ml4h.TensorMap import TensorMap, Interpretation


def mnist_image_from_hd5(tm: TensorMap, hd5: h5py.File, dependents: Dict = {}) -> np.ndarray:
    return np.array(hd5['mnist_image'])


mnist_image = TensorMap('mnist_image', shape=(28, 28, 1), tensor_from_file=mnist_image_from_hd5)


def mnist_label_from_hd5(tm: TensorMap, hd5: h5py.File, dependents: Dict = {}) -> np.ndarray:
    one_hot = np.zeros(tm.shape, dtype=np.float32)
    one_hot[int(hd5['mnist_label'][0])] = 1.0
    return one_hot


mnist_label = TensorMap(
    'mnist_label', Interpretation.CATEGORICAL,
    tensor_from_file=mnist_label_from_hd5,
    # The channel map below is an assumed completion; the original was truncated here.
    channel_map={f'digit_{i}': i for i in range(10)},
)
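# Minimal round-trip sketch, assuming the hd5 layout these callbacks expect:
with h5py.File('mnist_000.hd5', 'w') as hd5:
    hd5.create_dataset('mnist_image', data=np.zeros((28, 28, 1), dtype=np.float32))
    hd5.create_dataset('mnist_label', data=[7.0])
with h5py.File('mnist_000.hd5', 'r') as hd5:
    image = mnist_image.tensor_from_file(mnist_image, hd5)   # (28, 28, 1) array
    label = mnist_label.tensor_from_file(mnist_label, hd5)   # one-hot with index 7 set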
import os
from typing import List, Tuple, Dict
from itertools import product

import h5py
import numpy as np

from ml4h.defines import TENSOR_EXT
from ml4h.TensorMap import TensorMap, Interpretation

CONTINUOUS_TMAPS = [
    TensorMap(f'{n}d_cont', shape=tuple(range(2, n + 2)), interpretation=Interpretation.CONTINUOUS)
    for n in range(1, 6)
]
CATEGORICAL_TMAPS = [
    TensorMap(
        f'{n}d_cat',
        shape=tuple(range(2, n + 2)),
        interpretation=Interpretation.CATEGORICAL,
        channel_map={f'c_{i}': i for i in range(n + 1)},
    )
    for n in range(1, 6)
]
LANGUAGE_TMAP_1HOT_WINDOW = [
    TensorMap(
        'language_1hot_window',
        shape=(32, 26),
        interpretation=Interpretation.LANGUAGE,
        channel_map={f'c_{i}': i for i in range(26)},
    ),
]
            data = tm.hd5_first_dataset_in_group(
                hd5, key_prefix=f'{tm.path_prefix}/{k}',
            )
            drinks += float(data[0])
        return np.array([drinks], dtype=np.float32)
    return alcohol_from_file


log_25781_2 = TensorMap(
    '25781_Total-volume-of-white-matter-hyperintensities-from-T1-and-T2FLAIR-images_2_0',
    loss='logcosh',
    path_prefix='continuous',
    normalization={'mean': 7, 'std': 8},
    tensor_from_file=preprocess_with_function(np.log),
    channel_map={'white-matter-hyper-intensities': 0},
)

weight_lbs_2 = TensorMap(
    'weight_lbs',
    Interpretation.CONTINUOUS,
    normalization={'mean': 168.74, 'std': 34.1},
    loss='logcosh',
    channel_map={'weight_lbs': 0},
)
def __init__(
    self,
    tensor_map: TensorMap,
):
    self.tensor_map = tensor_map
    if not self.can_apply():
        return
    self.dense = Dense(
        tensor_map.shape[-1],
        activation=tensor_map.activation,
        name=tensor_map.output_name(),
    )
def _weighted_batch(
    in_batch: Batch,
    out_batch: Batch,
    return_paths: bool,
    paths: List[Path],
    sample_weight: TensorMap,
):
    # Pop the sample-weight tensor out of the inputs and repeat its flattened
    # values once per output, so every output head is weighted the same way.
    sample_weights = [in_batch.pop(sample_weight.input_name()).flatten()] * len(out_batch)
    return (
        (in_batch, out_batch, sample_weights, paths)
        if return_paths
        else (in_batch, out_batch, sample_weights)
    )
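# Hedged sketch of the shapes involved; names and sizes are illustrative.
weight_tm = TensorMap('sample_weight', shape=(1,))
in_batch = {weight_tm.input_name(): np.ones((8, 1), dtype=np.float32)}
out_batch = {'output_a': np.zeros((8, 2)), 'output_b': np.zeros((8, 1))}
ins, outs, weights = _weighted_batch(in_batch, out_batch, return_paths=False, paths=[], sample_weight=weight_tm)
# weights == [flattened (8,) array, flattened (8,) array], one per output head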
from ml4h.TensorMap import TensorMap, Interpretation
from ml4h.defines import StorageType
from ml4h.metrics import weighted_crossentropy

diploid_cm = {'homozygous_reference': 0, 'heterozygous': 1, 'homozygous_variant': 2}
rs3829740 = TensorMap('rs3829740', Interpretation.CATEGORICAL, channel_map=diploid_cm)
rs2234962 = TensorMap('rs2234962', Interpretation.CATEGORICAL, channel_map=diploid_cm)
rs2042995 = TensorMap('rs2042995', Interpretation.CATEGORICAL, channel_map=diploid_cm)

rs3829740_weighted = TensorMap(
    'rs3829740', Interpretation.CATEGORICAL, channel_map=diploid_cm,
    loss=weighted_crossentropy([1, 1, 1.5], 'rs3829740'),
)
rs2234962_weighted = TensorMap(
    'rs2234962', Interpretation.CATEGORICAL, channel_map=diploid_cm,
    loss=weighted_crossentropy([.8, 1, 1.5], 'rs2234962'),
)
rs2042995_weighted = TensorMap(
    'rs2042995', Interpretation.CATEGORICAL, channel_map=diploid_cm,
    # Loss weights below are an assumed completion; the original line was truncated here.
    loss=weighted_crossentropy([.8, 1, 1.5], 'rs2042995'),
)
        if incidence_only and censor_date <= assess_date:
            raise ValueError(f'{tm.name} only considers incident diagnoses')

        tensor = np.zeros(tm.shape, dtype=np.float32)
        tensor[0] = has_disease
        tensor[1] = (censor_date - assess_date).days
        return tensor
    return _cox_tensor_from_file


enroll_cad_hazard = TensorMap(
    'coronary_artery_disease',
    Interpretation.SURVIVAL_CURVE,
    shape=(50,),
    days_window=DAYS_IN_5_YEARS,
    tensor_from_file=_survival_tensor('dates/enroll_date', DAYS_IN_5_YEARS),
)
enroll_hyp_hazard = TensorMap(
    'hypertension',
    Interpretation.SURVIVAL_CURVE,
    shape=(50,),
    days_window=DAYS_IN_5_YEARS,
    tensor_from_file=_survival_tensor('dates/enroll_date', DAYS_IN_5_YEARS),
)
enroll_afib_hazard = TensorMap(
    'atrial_fibrillation_or_flutter',
    Interpretation.SURVIVAL_CURVE,
    shape=(50,),
    days_window=DAYS_IN_5_YEARS,
    # Assumed completion by analogy with the two maps above; the original was truncated here.
    tensor_from_file=_survival_tensor('dates/enroll_date', DAYS_IN_5_YEARS),
)
            writer_segmented.SetFileName(
                os.path.join(save_path, f'{tm.name}_segmented_{ds_i}_{ds_j}_{s}.vtp'),
            )
            writer_segmented.Update()
        return tensor
    return mri_projected_segmentation


cine_segmented_lax_2ch_proj_from_sax = TensorMap(
    'cine_segmented_lax_2ch_proj_from_sax',
    Interpretation.CONTINUOUS,
    shape=(256, 256, 50),
    loss='logcosh',
    tensor_from_file=_make_mri_projected_segmentation_from_file('cine_segmented_lax_2ch', MRI_SEGMENTED),
)
cine_segmented_lax_3ch_proj_from_sax = TensorMap(
    'cine_segmented_lax_3ch_proj_from_sax',
    Interpretation.CONTINUOUS,
    shape=(256, 256, 50),
    loss='logcosh',
    tensor_from_file=_make_mri_projected_segmentation_from_file('cine_segmented_lax_3ch', MRI_SEGMENTED),
)
cine_segmented_lax_4ch_proj_from_sax = TensorMap(
    'cine_segmented_lax_4ch_proj_from_sax',
    Interpretation.CONTINUOUS,
    shape=(256, 256, 50),
    loss='logcosh',
    # Assumed completion by analogy with the two maps above; the original was truncated here.
    tensor_from_file=_make_mri_projected_segmentation_from_file('cine_segmented_lax_4ch', MRI_SEGMENTED),
)
def reshape_resting_ecg_to_tidy(
    sample_id: Union[int, str],
    folder: Optional[str] = None,
    tmap: TensorMap = DEFAULT_RESTING_ECG_SIGNAL_TMAP,
) -> pd.DataFrame:
    """Wrangle resting ECG data to tidy.

    Args:
        sample_id: The id of the ECG sample to retrieve.
        folder: The local or Cloud Storage folder under which the files reside.
        tmap: The TensorMap to use for ECG input.

    Returns:
        A pandas dataframe in tidy (long) format; on failure, prints a
        notebook-friendly error and returns an empty dataframe.
    """
    if folder is None:
        folder = get_resting_ecg_hd5_folder(sample_id)

    data: Dict[str, Any] = {
        'lead': [], 'raw': [], 'ts_reference': [],
        'filtered': [], 'filtered_1': [], 'filtered_2': [],
    }

    with tempfile.TemporaryDirectory() as tmpdirname:
        sample_hd5 = str(sample_id) + '.hd5'
        local_path = os.path.join(tmpdirname, sample_hd5)
        try:
            tf.io.gfile.copy(src=os.path.join(folder, sample_hd5), dst=local_path)
        except (tf.errors.NotFoundError, tf.errors.PermissionDeniedError) as e:
            print(f'''Warning: Resting ECG not available for sample {sample_id} in folder {folder}.
Use the folder parameter to read HD5s from a different directory or bucket.\n\n{e.message}''')
            return pd.DataFrame(data)

        with h5py.File(local_path, mode='r') as hd5:
            try:
                signals = tmap.tensor_from_file(tmap, hd5)
            except (KeyError, ValueError) as e:
                print(f'''Warning: Resting ECG TMAP {tmap.name} not available for sample {sample_id}.
Use the tmap parameter to choose a different TMAP.\n\n{e}''')
                _examine_available_keys(hd5)
                return pd.DataFrame(data)

            for lead, channel in ECG_REST_LEADS.items():
                signal = signals[:, channel]
                signal_length = len(signal)
                data['raw'].extend(signal)
                data['lead'].extend([lead] * signal_length)
                data['ts_reference'].extend(np.array([i * 1. / (SAMPLING_RATE + 1.) for i in range(0, signal_length)]))
                # Band-pass filter the signal three ways, with upper cutoffs of 50, 20, and 30 Hz.
                filtered, _, _ = filter_signal(
                    signal=signal,
                    ftype='FIR',
                    band='bandpass',
                    order=int(0.3 * SAMPLING_RATE),
                    frequency=[.9, 50],
                    sampling_rate=SAMPLING_RATE,
                )
                data['filtered'].extend(filtered)
                filtered_1, _, _ = filter_signal(
                    signal=signal,
                    ftype='FIR',
                    band='bandpass',
                    order=int(0.3 * SAMPLING_RATE),
                    frequency=[.9, 20],
                    sampling_rate=SAMPLING_RATE,
                )
                data['filtered_1'].extend(filtered_1)
                filtered_2, _, _ = filter_signal(
                    signal=signal,
                    ftype='FIR',
                    band='bandpass',
                    order=int(0.3 * SAMPLING_RATE),
                    frequency=[.9, 30],
                    sampling_rate=SAMPLING_RATE,
                )
                data['filtered_2'].extend(filtered_2)

    signal_df = pd.DataFrame(data)
    # Convert the raw signal to mV.
    signal_df['raw_mV'] = signal_df['raw'] * RAW_SCALE
    signal_df['filtered_mV'] = signal_df['filtered'] * RAW_SCALE
    signal_df['filtered_1_mV'] = signal_df['filtered_1'] * RAW_SCALE
    signal_df['filtered_2_mV'] = signal_df['filtered_2'] * RAW_SCALE

    # Reshape to tidy (long format).
    tidy_signal_df = signal_df.melt(
        id_vars=['lead', 'ts_reference'],
        value_vars=['raw_mV', 'filtered_mV', 'filtered_1_mV', 'filtered_2_mV'],
        var_name='filtering',
        value_name='signal_mV',
    )

    # The leads have a meaningful order; apply the order to this column.
    lead_factor_type = pd.api.types.CategoricalDtype(
        categories=[
            'strip_I', 'strip_aVR', 'strip_V1', 'strip_V4',
            'strip_II', 'strip_aVL', 'strip_V2', 'strip_V5',
            'strip_III', 'strip_aVF', 'strip_V3', 'strip_V6',
        ],
        ordered=True,
    )
    tidy_signal_df['lead'] = tidy_signal_df.lead.astype(lead_factor_type)

    return tidy_signal_df
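# Hedged usage sketch; the sample id is hypothetical.
tidy_df = reshape_resting_ecg_to_tidy(sample_id=1000107)
tidy_df[tidy_df['filtering'] == 'raw_mV'].head()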