Exemple #1
0
def plot_history(
        logs,
        cols=('avg_train_loss', 'val_loss'),
        save_as=None,
        figsize=(10, 5),
        dpi=100,
):
    """Plot selected columns of a training log as line curves.

    Args:
        logs: Records accepted by ``pd.DataFrame.from_records``; one row per
            record. The row index is used as the x axis (labelled 'Epoch').
        cols: Names of the columns to draw, one curve per column.
        save_as: If None the figure is shown with ``plt.show()``; otherwise
            it is written to this path with ``plt.savefig``.
        figsize: Figure size forwarded to ``plt.figure``.
        dpi: Resolution forwarded to ``plt.savefig`` (unused when showing).
    """
    df = pd.DataFrame.from_records(logs)

    plt.figure(figsize=figsize)

    for col in cols:
        plt.plot(
            df.index,
            # Builtin float instead of the ``np.float`` alias, which was
            # removed in NumPy 1.24; the resulting dtype (float64) is the same.
            df[col].astype(float),
            label=col,
        )

    # The y axis is clamped to [0, 1]; larger values are clipped from view.
    plt.ylim(0, 1)
    plt.legend()
    plt.xlabel('Epoch')
    plt.ylabel('Loss')

    if save_as is None:
        plt.show()
    else:
        plt.savefig(save_as, dpi=dpi)
        debug(f"Saved as {save_as}")
Exemple #2
0
 def select_dicom(df, image_laterality, view_position):
     """Pick one row label for the given laterality and view position.

     Args:
         df: DataFrame with 'image_laterality' and 'view_position' columns
             (and 'acquisition_date' / 'acquisition_time' when several rows
             can match).
         image_laterality: Value to match in the 'image_laterality' column.
         view_position: Value to match in the 'view_position' column.

     Returns:
         The index label of the matching row. When several rows match, the
         one that sorts first by acquisition date and time in descending
         order is chosen. ``np.nan`` when no row matches.
     """
     df = df.loc[(df['image_laterality'] == image_laterality) & (df['view_position'] == view_position)]
     if len(df) == 0:
         # np.nan rather than the np.NaN alias, which was removed in NumPy 2.0.
         return np.nan
     if len(df) > 1:
         # Several candidates: order them newest-first and log the candidates.
         debug("---")
         df = df.sort_values(['acquisition_date', 'acquisition_time'], ascending=False)
         debug(f"{df[['acquisition_date', 'acquisition_time']]}")
     return df.index[0]
Exemple #3
0
 def __init__(
     self,
     classifier_metrics,
     detective_metrics,
     display_fn=_default_display_fn,
     residual_factor=0.95,
     reset_running_avg_between_epochs=False,
 ):
     """Record the constructor arguments on the instance.

     Every argument is stored unchanged under an attribute of the same
     name, and the chosen ``residual_factor`` is reported through
     ``debug``.

     Args:
         classifier_metrics: Stored as ``self.classifier_metrics``.
         detective_metrics: Stored as ``self.detective_metrics``.
         display_fn: Stored as ``self.display_fn``.
         residual_factor: Stored as ``self.residual_factor``; presumably the
             decay weight of a running average -- confirm against the
             methods that read it.
         reset_running_avg_between_epochs: Stored under the same name.
     """
     # Metric collections.
     self.classifier_metrics, self.detective_metrics = (
         classifier_metrics,
         detective_metrics,
     )

     # Running-average settings.
     self.reset_running_avg_between_epochs = reset_running_avg_between_epochs
     self.residual_factor = residual_factor
     debug(f"residual_factor is set to be {self.residual_factor}")

     # Display callback.
     self.display_fn = display_fn
Exemple #4
0
def view_history(
    logs,
    cols=None,
    save_as=None,
):
    """Print a training log as a table, or write it to a CSV file.

    Args:
        logs: Records accepted by ``pd.DataFrame.from_records``.
        cols: Optional column selection applied to the table; None keeps
            every column.
        save_as: When None the table is printed to stdout; otherwise it is
            written to this path with ``DataFrame.to_csv``.
    """
    table = pd.DataFrame.from_records(logs)
    table = table if cols is None else table[cols]

    if save_as is not None:
        table.to_csv(save_as)
        debug(f"Saved as {save_as}")
        return

    print(table)
Exemple #5
0
def resume(path_to_checkpoint, pwd=None, extra_epochs=100):
    """Load a pickled engine from a checkpoint and continue running it.

    Args:
        path_to_checkpoint: Path to a file containing the pickled engine.
        pwd: If not None, replaces ``engine.pwd`` before the run resumes.
        extra_epochs: Amount added to ``engine.max_epochs`` before the run
            resumes (default 100).
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only resume from checkpoints that come from a trusted source.
    with open(path_to_checkpoint, 'rb') as f:
        engine = pickle.load(f)

    # Move the epoch counter forward by one before running again
    # (presumably the checkpointed epoch had already finished -- confirm).
    engine.i_epoch += 1

    # Capture the previous value BEFORE incrementing so that the log message
    # is accurate and the budget is extended exactly once.
    old = engine.max_epochs
    engine.max_epochs += extra_epochs
    debug(f"engine.max_epochs is increased from {old} to {engine.max_epochs}")

    if pwd is not None:
        old = engine.pwd
        engine.pwd = pwd
        debug(f"engine.pwd is changed from {old} to {engine.pwd}")

    engine.run()
Exemple #6
0
def _drop_and_report(frame, del_vec, message, path_to_csv):
    """Remove the rows flagged by ``del_vec``, reporting them if there are any.

    When at least one row is flagged, a warning built from ``message`` (its
    ``{n}`` placeholder is replaced by the number of flagged rows) is
    emitted, the flagged rows are printed and they are written to
    ``path_to_csv``. Returns ``frame`` without the flagged rows.
    """
    n_excluded = del_vec.sum()
    if n_excluded > 0:
        warn(message.format(n=n_excluded))
        print(frame.loc[del_vec])
        frame.loc[del_vec].to_csv(path_to_csv)
    return frame.loc[~del_vec]


def merge_and_process(config):
    """Merge the Mayo and UCSF metadata, filter it, and write the result.

    The function builds two tables, a per-DICOM table (``dmeta``) and a
    per-patient table (``pmeta``), drops unusable rows from both, and writes
    ``dmeta.csv`` and ``pmeta.csv`` under ``config['output_base_dir']``.
    Every group of dropped rows is saved as a CSV under the ``excluded``
    sub-directory.

    Args:
        config: Mapping that must provide ``'output_base_dir'`` and
            ``'n_samples'`` (None keeps every patient; otherwise that many
            patients are sampled). The whole mapping is also handed to
            ``process_one_dicom`` with each task.
    """
    output_base_dir = config['output_base_dir']
    excluded_dir = f"{output_base_dir}/excluded"
    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs(excluded_dir, exist_ok=True)

    dmeta_mayo, pmeta_mayo = find_all_dicoms_for_mayo()
    dmeta_ucsf, pmeta_ucsf = find_all_dicoms_for_ucsf()
    print(f"len(dmeta_mayo) =\n{len(dmeta_mayo)}")
    print(f"len(dmeta_ucsf) =\n{len(dmeta_ucsf)}")
    print(f"len(pmeta_mayo) =\n{len(pmeta_mayo)}")
    print(f"len(pmeta_ucsf) =\n{len(pmeta_ucsf)}")

    dmeta = pd.concat([dmeta_mayo, dmeta_ucsf])
    print(f"len(dmeta) =\n{len(dmeta)}")

    pmeta = pd.concat([pmeta_mayo, pmeta_ucsf])
    print(f"len(pmeta) =\n{len(pmeta)}")

    # -----------------------------------------------------------------------------------------------------------
    # STEP 1: dmeta
    # -----------------------------------------------------------------------------------------------------------

    # dmeta = pd.concat(
    #     [
    #         process_ucsf_dmeta('raw_data/ucsf_meta_train.csv'),
    #         # process_ucsf_dmeta('raw_data/ucsf_meta_test.csv'),
    #         process_mayo_dmeta('raw_data/mayo_meta_train.csv'),
    #         # process_mayo_dmeta('raw_data/mayo_meta_test.csv'),
    #     ]
    # )
    dmeta = dmeta.set_index('dicom_id')

    # find out all rows whose DICOM files do not exist
    dmeta['dicom_exists'] = [os.path.isfile(p) for p in dmeta['path_to_dicom']]
    dmeta = _drop_and_report(
        dmeta,
        ~dmeta['dicom_exists'],
        "The following {n} DICOMs are excluded because their files are not found in the folder:",
        f"{excluded_dir}/dicoms_file_not_found.csv",
    )

    # -----------------------------------------------------------------------------------------------------------
    # STEP 2: pmeta
    # -----------------------------------------------------------------------------------------------------------

    pmeta = pmeta.set_index('patient_id')

    pmeta_with_labels = pd.concat(
        [
            process_ucsf_pmeta('raw_data/ucsf_meta_train.csv').assign(source='ucsf', group='train'),
            process_ucsf_pmeta('raw_data/ucsf_meta_test.csv').assign(source='ucsf', group='test'),
            process_mayo_pmeta('raw_data/mayo_meta_train.csv').assign(source='mayo', group='train'),
            process_mayo_pmeta('raw_data/mayo_meta_test.csv').assign(source='mayo', group='test'),
        ]
    )[['patient_id', 'label', 'group']].set_index('patient_id')

    # Left join: patients absent from the labelled files keep a missing
    # label and are assigned to the 'test2' group.
    pmeta = pmeta.join(pmeta_with_labels)
    pmeta['label'] = pmeta['label'].astype("UInt8")
    pmeta['group'] = pmeta['group'].fillna('test2')

    pmeta['n_dicoms'] = dmeta.groupby('patient_id').size()
    # Delete patients who do not have at least 4 DICOMs
    # NOTE(review): patients with no row at all in dmeta get NaN here, and
    # NaN < 4 is False, so they are not dropped by this filter; they are only
    # dropped in STEP 4 because all of their views are missing.
    pmeta = _drop_and_report(
        pmeta,
        pmeta['n_dicoms'] < 4,
        "The following {n} patients are excluded because they do not have at least 4 DICOMs:",
        f"{excluded_dir}/patients_view_missing_1.csv",
    )
    # INFO: there are 564 patients with more than 4 dicoms

    if config['n_samples'] is not None:
        pmeta = pmeta.reset_index()
        pmeta = pmeta.sort_values(['n_dicoms', 'patient_id'])
        pmeta = pmeta.set_index('patient_id')
        # NOTE(review): no random_state is passed, so the sample differs
        # between runs -- confirm whether this is intended.
        pmeta = pmeta.sample(n=config['n_samples'])

    patient_ids = set(pmeta.index)
    dmeta = _drop_and_report(
        dmeta,
        ~dmeta['patient_id'].isin(patient_ids),
        "The following {n} DICOMs are excluded because their patients have just been excluded:",
        f"{excluded_dir}/dicoms_exclued_because_patients_view_missing_1.csv",
    )

    # -----------------------------------------------------------------------------------------------------------
    # STEP 3: Read each of the DICOM files, write down their key info from the
    #         headers, and preprocess them
    # -----------------------------------------------------------------------------------------------------------

    def process_dicoms_taskgen():
        # One task per DICOM row; 'dicom_id' is turned back into a column so
        # that it is part of the row dict.
        for _, x in dmeta.reset_index().iterrows():
            yield {'row': dict(x), 'config': config}

    results = list(parallelize(process_one_dicom, process_dicoms_taskgen(), len_=len(dmeta)))
    # dmeta is rebuilt from the 'row' entries of the results.
    dmeta = pd.DataFrame.from_records([x['row'] for x in results])
    debug(dmeta)

    dmeta = _drop_and_report(
        dmeta,
        dmeta['view_mod'],
        "The following {n} DICOMs are excluded because "
        "they have a modified view (ViewModifierCodeSequence):",
        f"{excluded_dir}/dicoms_exclued_because_view_modified.csv",
    )
    dmeta = dmeta[[
        'dicom_id',
        'pixel_intensity_relationship',
        'pixel_intensity_relationship_sign',
        'window_center',
        'window_width',
        'patient_id',
        'image_laterality',
        'view_position',
        'flipped',
        'dicom_exists',
        'height',
        'width',
        'view_mod',
        'field_of_view_horizontal_flip',
        'left_half_sum',
        'right_half_sum',
        'acquisition_date',
        'acquisition_time',
        'path_to_dicom',
        'pixel_disagree_with_header',
    ]]
    dmeta = dmeta.sort_values([
        'patient_id',
        'image_laterality',
        'view_position',
    ])
    dmeta = dmeta.set_index('dicom_id')
    dmeta.to_csv(f"{output_base_dir}/dmeta.csv")

    # -----------------------------------------------------------------------------------------------------------
    # STEP 4: Go back to processing pmeta, now that we have much more info from
    #         the DICOM files.
    # -----------------------------------------------------------------------------------------------------------

    def select_dicom(df, image_laterality, view_position):
        # Returns the index label (dicom_id) of the matching row, preferring
        # the row that sorts first by acquisition date/time in descending
        # order when several match, or np.nan when none matches.
        df = df.loc[(df['image_laterality'] == image_laterality) & (df['view_position'] == view_position)]
        if len(df) == 0:
            # np.nan rather than the np.NaN alias removed in NumPy 2.0.
            return np.nan
        if len(df) > 1:
            debug("---")
            df = df.sort_values(['acquisition_date', 'acquisition_time'], ascending=False)
            debug(f"{df[['acquisition_date', 'acquisition_time']]}")
        return df.index[0]

    for patient_id, dmeta_g in dmeta.groupby('patient_id'):
        pmeta.loc[patient_id, 'LCC'] = select_dicom(dmeta_g, 'L', 'CC')
        pmeta.loc[patient_id, 'RCC'] = select_dicom(dmeta_g, 'R', 'CC')
        pmeta.loc[patient_id, 'LMLO'] = select_dicom(dmeta_g, 'L', 'MLO')
        pmeta.loc[patient_id, 'RMLO'] = select_dicom(dmeta_g, 'R', 'MLO')

    pmeta = _drop_and_report(
        pmeta,
        pmeta[['LCC', 'RCC', 'LMLO', 'RMLO']].isna().any(axis=1),
        "The following {n} patients are excluded because they "
        "have at least one view missing (patients_view_missing.csv):",
        f"{excluded_dir}/patients_view_missing_2.csv",
    )

    pmeta = pmeta.reset_index()
    # The sort uses the n_dicoms counted in STEP 2; the column is refreshed
    # from the final dmeta only afterwards.
    pmeta = pmeta.sort_values(['n_dicoms', 'patient_id'])
    pmeta = pmeta.set_index('patient_id')
    pmeta['n_dicoms'] = dmeta.groupby('patient_id').size()
    pmeta.to_csv(f"{output_base_dir}/pmeta.csv")

    print(output_base_dir)