def plot_history(
    logs,
    cols=('avg_train_loss', 'val_loss'),
    save_as=None,
    figsize=(10, 5),
    dpi=100,
):
    """Plot selected loss curves from a list of per-epoch log records.

    Parameters
    ----------
    logs : list of dict
        One record per epoch; each must contain every key in ``cols``.
    cols : iterable of str
        Column names to plot against the epoch index.
        (Default is a tuple rather than a list to avoid a shared
        mutable default argument.)
    save_as : str or None
        If None, show the figure interactively; otherwise write it to
        this path via ``plt.savefig``.
    figsize : tuple
        Matplotlib figure size in inches.
    dpi : int
        Resolution used only when saving.
    """
    df = pd.DataFrame.from_records(logs)
    plt.figure(figsize=figsize)
    for col in cols:
        # `np.float` was removed in NumPy 1.24; the builtin `float`
        # is the documented equivalent.
        plt.plot(
            df.index,
            df[col].astype(float),
            label=col,
        )
    # Losses here are assumed to live in [0, 1] — TODO confirm this
    # holds for all logged metrics.
    plt.ylim(0, 1)
    plt.legend()
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    if save_as is None:
        plt.show()
    else:
        plt.savefig(save_as, dpi=dpi)
        debug(f"Saved as {save_as}")
def select_dicom(df, image_laterality, view_position):
    """Pick the dicom_id for one (laterality, view) combination.

    Filters *df* to rows matching ``image_laterality`` (e.g. 'L'/'R')
    and ``view_position`` (e.g. 'CC'/'MLO').

    Returns
    -------
    The index label (dicom_id) of the selected row; when several rows
    match, the most recently acquired one (by acquisition_date, then
    acquisition_time) is chosen. Returns NaN when nothing matches.
    """
    df = df.loc[(df['image_laterality'] == image_laterality)
                & (df['view_position'] == view_position)]
    if len(df) == 0:
        # `np.NaN` was removed in NumPy 2.0; `np.nan` is the canonical spelling.
        return np.nan
    if len(df) > 1:
        # Duplicates: keep the newest acquisition and log the candidates.
        debug("---")
        df = df.sort_values(['acquisition_date', 'acquisition_time'], ascending=False)
        debug(f"{df[['acquisition_date', 'acquisition_time']]}")
    return df.index[0]
def __init__(
    self,
    classifier_metrics,
    detective_metrics,
    display_fn=_default_display_fn,
    residual_factor=0.95,
    reset_running_avg_between_epochs=False,
):
    """Store the metric collections and display/averaging configuration.

    Parameters
    ----------
    classifier_metrics, detective_metrics :
        Metric collections kept as-is on the instance.
    display_fn : callable
        Used to render progress/results.
    residual_factor : float
        Smoothing factor for the running average.
    reset_running_avg_between_epochs : bool
        Whether the running average restarts at each epoch boundary.
    """
    self.classifier_metrics = classifier_metrics
    self.detective_metrics = detective_metrics
    self.residual_factor = residual_factor
    self.reset_running_avg_between_epochs = reset_running_avg_between_epochs
    self.display_fn = display_fn
    debug(f"residual_factor is set to be {self.residual_factor}")
def view_history(
    logs,
    cols=None,
    save_as=None,
):
    """Show per-epoch log records as a table, or dump them to CSV.

    Parameters
    ----------
    logs : list of dict
        One record per epoch.
    cols : list of str or None
        Optional subset of columns to keep; None keeps everything.
    save_as : str or None
        When None the table is printed to stdout; otherwise it is
        written to this CSV path.
    """
    df = pd.DataFrame.from_records(logs)
    if cols is not None:
        df = df[cols]
    if save_as is None:
        # Interactive path: just print and stop.
        print(df)
        return
    df.to_csv(save_as)
    debug(f"Saved as {save_as}")
def resume(path_to_checkpoint, pwd=None):
    """Resume a pickled training engine, extending it by 100 epochs.

    Loads the engine from *path_to_checkpoint*, advances to the next
    epoch, grants 100 more epochs, optionally relocates the working
    directory, and re-enters the training loop.

    Parameters
    ----------
    path_to_checkpoint : str
        Path to a pickled engine object.
    pwd : str or None
        If given, overrides the engine's working directory.

    SECURITY NOTE: ``pickle.load`` executes arbitrary code from the
    file — only resume from trusted checkpoints.
    """
    with open(path_to_checkpoint, 'rb') as f:
        engine = pickle.load(f)
    engine.i_epoch += 1
    # BUGFIX: the original incremented max_epochs twice (+200 total) and
    # captured `old` between the two increments, so the debug message
    # reported a wrong "old" value. Capture first, then add 100 once.
    old = engine.max_epochs
    engine.max_epochs += 100
    debug(f"engine.max_epochs is increased from {old} to {engine.max_epochs}")
    if pwd is not None:
        old = engine.pwd
        engine.pwd = pwd
        debug(f"engine.pwd is changed from {old} to {engine.pwd}")
    engine.run()
def merge_and_process(config):
    """Merge Mayo + UCSF DICOM/patient metadata, preprocess every DICOM,
    and write `dmeta.csv` / `pmeta.csv` (plus exclusion reports) under
    `config['output_base_dir']`.

    Expected config keys (as read below): 'output_base_dir' and
    'n_samples' (int or None). Relies on module-level helpers
    (find_all_dicoms_*, process_*_pmeta, process_one_dicom,
    parallelize, warn, debug) defined elsewhere in the project.
    """
    output_base_dir = config['output_base_dir']
    # Gather per-site DICOM-level (dmeta) and patient-level (pmeta) tables.
    dmeta_mayo, pmeta_mayo = find_all_dicoms_for_mayo()
    dmeta_ucsf, pmeta_ucsf = find_all_dicoms_for_ucsf()
    print(f"len(dmeta_mayo) =\n{len(dmeta_mayo)}")
    print(f"len(dmeta_ucsf) =\n{len(dmeta_ucsf)}")
    print(f"len(pmeta_mayo) =\n{len(pmeta_mayo)}")
    print(f"len(pmeta_ucsf) =\n{len(pmeta_ucsf)}")
    dmeta = pd.concat([dmeta_mayo, dmeta_ucsf])
    print(f"len(dmeta) =\n{len(dmeta)}")
    pmeta = pd.concat([pmeta_mayo, pmeta_ucsf])
    print(f"len(pmeta) =\n{len(pmeta)}")
    # -----------------------------------------------------------------------------------------------------------
    # STEP 1: dmeta
    # -----------------------------------------------------------------------------------------------------------
    # dmeta = pd.concat(
    #     [
    #         process_ucsf_dmeta('raw_data/ucsf_meta_train.csv'),
    #         # process_ucsf_dmeta('raw_data/ucsf_meta_test.csv'),
    #         process_mayo_dmeta('raw_data/mayo_meta_train.csv'),
    #         # process_mayo_dmeta('raw_data/mayo_meta_test.csv'),
    #     ]
    # )
    dmeta = dmeta.set_index('dicom_id')
    # find out all rows whose DICOM files do not exist
    dmeta['dicom_exists'] = [os.path.isfile(p) for p in dmeta['path_to_dicom']]
    # del_vec marks rows to drop; report them before filtering.
    del_vec = ~dmeta['dicom_exists']
    if sum(del_vec) > 0:
        warn(f"The following {sum(del_vec)} DICOMs are excluded because their files are not found in the folder:")
        print(dmeta.loc[del_vec])
        dmeta.loc[del_vec].to_csv(f"{output_base_dir}/excluded/dicoms_file_not_found.csv")
    dmeta = dmeta.loc[~del_vec]
    # -----------------------------------------------------------------------------------------------------------
    # STEP 2: pmeta
    # -----------------------------------------------------------------------------------------------------------
    pmeta = pmeta.set_index('patient_id')
    # Attach labels + train/test grouping from the raw per-site CSVs.
    pmeta_with_labels = pd.concat(
        [
            process_ucsf_pmeta('raw_data/ucsf_meta_train.csv').assign(source='ucsf', group='train'),
            process_ucsf_pmeta('raw_data/ucsf_meta_test.csv').assign(source='ucsf', group='test'),
            process_mayo_pmeta('raw_data/mayo_meta_train.csv').assign(source='mayo', group='train'),
            process_mayo_pmeta('raw_data/mayo_meta_test.csv').assign(source='mayo', group='test'),
        ]
    )[['patient_id', 'label', 'group']].set_index('patient_id')
    pmeta = pmeta.join(pmeta_with_labels)
    # Nullable UInt8 keeps NaN-capable integer labels after the join.
    pmeta['label'] = pmeta['label'].astype("UInt8")
    # Patients absent from the labelled CSVs fall into a 'test2' group.
    pmeta['group'] = pmeta['group'].fillna('test2')
    pmeta['n_dicoms'] = dmeta.groupby('patient_id').size()
    # Delete patients who do not have at least 4 DICOMs
    del_vec = pmeta['n_dicoms'] < 4
    if sum(del_vec) > 0:
        warn(f"The following {sum(del_vec)} patients are excluded because they do not have at least 4 DICOMs:")
        print(pmeta.loc[del_vec])
        pmeta.loc[del_vec].to_csv(f"{output_base_dir}/excluded/patients_view_missing_1.csv")
    pmeta = pmeta.loc[~del_vec]
    # INFO: there are 564 patients with more than 4 dicoms
    # Optional subsampling for smaller runs; sorting first makes the
    # index deterministic before `sample` draws (NOTE(review): `sample`
    # itself is unseeded, so the draw is still random — confirm intended).
    if config['n_samples'] is not None:
        pmeta = pmeta.reset_index()
        pmeta = pmeta.sort_values(['n_dicoms', 'patient_id'])
        pmeta = pmeta.set_index('patient_id')
        pmeta = pmeta.sample(n=config['n_samples'])
    # Drop DICOMs whose patients were removed above.
    patient_ids = set(pmeta.index)
    del_vec = ~dmeta['patient_id'].isin(patient_ids)
    if sum(del_vec) > 0:
        warn(f"The following {sum(del_vec)} DICOMs are excluded because their patients have just been excluded:")
        print(dmeta.loc[del_vec])
        # NOTE(review): "exclued" is a typo, but the filename is an
        # output contract — renaming could break downstream consumers.
        dmeta.loc[del_vec].to_csv(f"{output_base_dir}/excluded/dicoms_exclued_because_patients_view_missing_1.csv")
    dmeta = dmeta.loc[~del_vec]
    # -----------------------------------------------------------------------------------------------------------
    # STEP 3: Read each of the DICOM files, write down their key info from the
    # headers, and preprocess them
    # -----------------------------------------------------------------------------------------------------------
    def process_dicoms_taskgen():
        # One task per DICOM row; each worker sees the row dict + config.
        for _, x in dmeta.reset_index().iterrows():
            yield {'row': dict(x), 'config': config}
    results = list(parallelize(process_one_dicom, process_dicoms_taskgen(), len_=len(dmeta)))
    # Rebuild dmeta from the (augmented) row dicts returned by the workers.
    dmeta = pd.DataFrame.from_records([x['row'] for x in results])
    debug(dmeta)
    # Exclude images whose view was modified (per DICOM header).
    del_vec = dmeta['view_mod']
    if sum(del_vec) > 0:
        warn(
            f"The following {sum(del_vec)} DICOMs are excluded because "
            f"they have a modified view (ViewModifierCodeSequence):"
        )
        print(dmeta.loc[del_vec])
        dmeta.loc[del_vec].to_csv(f"{output_base_dir}/excluded/dicoms_exclued_because_view_modified.csv")
    dmeta = dmeta.loc[~del_vec]
    # Keep a fixed, ordered subset of columns for the output CSV.
    dmeta = dmeta[[
        'dicom_id',
        'pixel_intensity_relationship',
        'pixel_intensity_relationship_sign',
        'window_center',
        'window_width',
        'patient_id',
        'image_laterality',
        'view_position',
        'flipped',
        'dicom_exists',
        'height',
        'width',
        'view_mod',
        'field_of_view_horizontal_flip',
        'left_half_sum',
        'right_half_sum',
        'acquisition_date',
        'acquisition_time',
        'path_to_dicom',
        'pixel_disagree_with_header',
    ]]
    dmeta = dmeta.sort_values([
        'patient_id',
        'image_laterality',
        'view_position',
    ])
    dmeta = dmeta.set_index('dicom_id')
    dmeta.to_csv(f"{output_base_dir}/dmeta.csv")
    # -----------------------------------------------------------------------------------------------------------
    # STEP 4: Go back to processing pmeta, now that we have much more info from
    # the DICOM files.
    # -----------------------------------------------------------------------------------------------------------
    # NOTE(review): this shadows the module-level select_dicom of the
    # same name and identical body — consider reusing that one.
    def select_dicom(df, image_laterality, view_position):
        # Pick the dicom_id for one (laterality, view); newest wins on ties.
        df = df.loc[(df['image_laterality'] == image_laterality) & (df['view_position'] == view_position)]
        if len(df) == 0:
            return np.NaN
        elif len(df) > 1:
            debug("---")
            df = df.sort_values(['acquisition_date', 'acquisition_time'], ascending=False)
            debug(f"{df[['acquisition_date', 'acquisition_time']]}")
            return df.index[0]
        else:
            return df.index[0]
    # Resolve the four standard mammography views per patient.
    for patient_id, dmeta_g in dmeta.groupby('patient_id'):
        pmeta.loc[patient_id, 'LCC'] = select_dicom(dmeta_g, 'L', 'CC')
        pmeta.loc[patient_id, 'RCC'] = select_dicom(dmeta_g, 'R', 'CC')
        pmeta.loc[patient_id, 'LMLO'] = select_dicom(dmeta_g, 'L', 'MLO')
        pmeta.loc[patient_id, 'RMLO'] = select_dicom(dmeta_g, 'R', 'MLO')
    # A patient must have all four views to be kept.
    del_vec = pmeta[['LCC', 'RCC', 'LMLO', 'RMLO']].isna().any(axis=1)
    if sum(del_vec) > 0:
        warn(
            f"The following {sum(del_vec)} patients are excluded because they "
            f"have at least one view missing (patients_view_missing.csv):"
        )
        print(pmeta.loc[del_vec])
        pmeta.loc[del_vec].to_csv(f"{output_base_dir}/excluded/patients_view_missing_2.csv")
    pmeta = pmeta.loc[~del_vec]
    # Final deterministic ordering, recount DICOMs, and write outputs.
    pmeta = pmeta.reset_index()
    pmeta = pmeta.sort_values(['n_dicoms', 'patient_id'])
    pmeta = pmeta.set_index('patient_id')
    pmeta['n_dicoms'] = dmeta.groupby('patient_id').size()
    pmeta.to_csv(f"{output_base_dir}/pmeta.csv")
    print(output_base_dir)