def main():
    """
    Fit a multivariate Gaussian to the data. Get the distribution of positive cases.
    Output:
    1. CSV file: the probability for each positive sample.
    2. Bin file: the pickled Gaussian model.
    :return:
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--load-data-matrix-bin', type=str)
    parser.add_argument('--positive-list', type=str)
    parser.add_argument('--out-csv-cancer', type=str)
    parser.add_argument('--out-csv-all', type=str)
    parser.add_argument('--out-csv-non-cancer', type=str)
    # parser.add_argument('--out-png', type=str)
    parser.add_argument('--num-pc', type=int)
    args = parser.parse_args()

    fit_obj = FitGaussian()
    fit_obj.load_data(args.load_data_matrix_bin, args.num_pc)
    fit_obj.get_distribution(read_file_contents_list(args.positive_list),
                             args.out_csv_cancer)
    fit_obj.get_distribution_all(args.out_csv_all)
    fit_obj.get_distribution_non_cancer(
        args.out_csv_non_cancer, read_file_contents_list(args.positive_list))
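
# read_file_contents_list is assumed throughout these examples to read a plain-text
# file and return its lines as a list of strings; a minimal sketch of such a helper
# (not necessarily the project's actual implementation) could look like this:
def read_file_contents_list_sketch(file_list_txt):
    """Return the stripped, non-empty lines of a text file as a list (sketch)."""
    with open(file_list_txt) as f:
        return [line.strip() for line in f if line.strip()]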
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--neg-sample-list', type=str)
    parser.add_argument('--pos-sample-list', type=str)
    parser.add_argument('--out-file-list-folder', type=str)
    parser.add_argument('--n-fold', type=int, default=5)
    args = parser.parse_args()

    n_fold = KFold(n_splits=args.n_fold)

    neg_sample_list = read_file_contents_list(args.neg_sample_list)
    pos_sample_list = read_file_contents_list(args.pos_sample_list)

    n_fold_file_name_list = []

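    # First pass: collect, for each fold, the negative training files.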
    for neg_train_idx, neg_test_idx in n_fold.split(neg_sample_list):
        neg_train_file_name_list = [
            neg_sample_list[idx_file_name] for idx_file_name in neg_train_idx
        ]
        n_fold_file_name_list.append(neg_train_file_name_list)

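    # Second pass: append each fold's positive training files to its negative list.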
    idx_fold = 0
    for pos_train_idx, pos_test_idx in n_fold.split(pos_sample_list):
        pos_train_file_name_list = [
            pos_sample_list[idx_file_name] for idx_file_name in pos_train_idx
        ]
        train_file_name_list = n_fold_file_name_list[
            idx_fold] + pos_train_file_name_list
        n_fold_file_name_list[idx_fold] = train_file_name_list
        idx_fold += 1

    for idx_fold in range(args.n_fold):
        out_file_list_txt = os.path.join(args.out_file_list_folder,
                                         f'pca_fold_{idx_fold}.txt')
        write_list_to_file(n_fold_file_name_list[idx_fold], out_file_list_txt)
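
# For reference, sklearn's KFold.split() yields (train_idx, test_idx) index arrays
# per fold; those indices are used above to look file names back up in the sample
# lists. A tiny self-contained illustration with hypothetical file names:
from sklearn.model_selection import KFold

samples = [f'case_{i}.nii.gz' for i in range(10)]
for train_idx, test_idx in KFold(n_splits=5).split(samples):
    train_files = [samples[i] for i in train_idx]
    print(len(train_files), len(test_idx))  # 8 training files, 2 held out per fold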
def main():
    parser = argparse.ArgumentParser(
        description='Randomly select files covering a subset of unique subjects.')
    parser.add_argument('--file-list-total', type=str)
    parser.add_argument('--file-list-out', type=str)
    parser.add_argument('--num-file-select', type=int)

    args = parser.parse_args()

    file_list_total = read_file_contents_list(args.file_list_total)

    subject_list_total, subject_list_unique = get_subject_id_list(
        file_list_total)
    logger.info(f'num of total files {len(subject_list_total)}')
    logger.info(f'num of unique subject IDs {len(unique(subject_list_total))}')
    logger.info(f'num of unique subjects {len(subject_list_unique)}')

    # selected_subject_list = random.choices(subject_list_unique, k=args.num_file_select)
    selected_subject_list = random.sample(subject_list_unique,
                                          args.num_file_select)
    logger.info(f'num of selected subjects {len(selected_subject_list)}')
    logger.info(
        f'num of unique selected subjects {len(unique(selected_subject_list))}'
    )

    file_list_out = [
        file_list_total[subject_list_total.index(subject_id)]
        for subject_id in selected_subject_list
    ]

    save_file_contents_list(args.file_list_out, file_list_out)
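
# Design note: random.sample (used above) draws subjects without replacement, so the
# selected subjects are unique; random.choices (the commented-out variant) draws with
# replacement and can repeat subjects. Quick illustration with dummy IDs:
import random

subjects = ['S01', 'S02', 'S03', 'S04', 'S05']
print(random.sample(subjects, 3))      # three distinct subjects
print(random.choices(subjects, k=3))   # may contain duplicates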
def get_longitudinal_info_in_raw_label_data():
    file_name_list = read_file_contents_list(valid_bmi_file_list)
    file_name_list = [f'{file_name}.nii.gz' for file_name in file_name_list]
    print(file_name_list[-10:])
    subject_id_list = [
        ClinicalDataReaderSPORE._get_subject_id_from_file_name(file_name)
        for file_name in file_name_list
    ]
    subject_id_spore_format_list = [
        f'SPORE_{subj_id:08d}' for subj_id in subject_id_list
    ]

    subject_id_spore_format_list = list(set(subject_id_spore_format_list))

    # 1. Get the number of subject that have longitudinal in raw label file.
    # 2. Get the number of sessions that have ...

    raw_label_df = pd.read_excel(in_raw_label_file_xlsx)
    long_subj_data_list = {}
    # print(raw_label_df['SPORE'].to_list()[:10])
    for subj_id_spore_format in subject_id_spore_format_list:
        subj_df = raw_label_df[raw_label_df['SPORE'] == subj_id_spore_format]
        if len(subj_df) > 1:
            height_array = subj_df['heightinches'].to_numpy()
            weight_array = subj_df['weightpounds'].to_numpy()
            bmi_array = 703 * (weight_array / np.power(height_array, 2))
            long_subj_data_list[subj_id_spore_format] = {
                'heightinches': height_array,
                'weightpounds': weight_array,
                'bmi': bmi_array
            }

    print(f'Number of longitudinal subjects: {len(long_subj_data_list)}')

    return long_subj_data_list
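
# The conversion above uses the imperial BMI formula BMI = 703 * weight_lb / height_in^2.
# Quick sanity check with hypothetical height/weight arrays:
import numpy as np

height_array = np.array([70.0, 70.0])    # inches
weight_array = np.array([160.0, 180.0])  # pounds
print(703 * (weight_array / np.power(height_array, 2)))  # ~[22.96, 25.82] kg/m^2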
def main():
    parser = argparse.ArgumentParser(
        description='Exclude files whose subject ID appears in the exclusion list.')
    parser.add_argument('--file-list-total', type=str)
    parser.add_argument('--subject-id-exclude-file-list', type=str)
    parser.add_argument('--file-list-out', type=str)
    args = parser.parse_args()

    file_list_total = read_file_contents_list(args.file_list_total)
    subject_id_exclude_file_list = read_file_contents_list(
        args.subject_id_exclude_file_list)

    subject_id_exclude_list = get_subject_id_list(subject_id_exclude_file_list)

    file_list_reduced = [
        file_name for file_name in file_list_total
        if ClinicalDataReaderSPORE._get_subject_id_from_file_name(file_name)
        not in subject_id_exclude_list
    ]

    save_file_contents_list(args.file_list_out, file_list_reduced)
def get_csv(args):
    file_list = read_file_contents_list(args.file_list_txt)
    in_ori_folder_obj = DataFolder(args.in_ori_folder, file_list)
    in_mask_folder_obj = DataFolder(args.in_mask_folder, file_list)

    exe_obj = MeanIntensityMask(in_ori_folder_obj, in_mask_folder_obj, [2, 4],
                                20)
    result_dict_list = exe_obj.run_parallel()

    result_df = pd.DataFrame(result_dict_list)
    result_df = result_df.set_index('file_name')

    print(f'Output csv to {args.out_csv}')
    result_df.to_csv(args.out_csv)
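
# get_csv assembles its table from a list of per-file result dicts; the pandas
# pattern it relies on, shown with hypothetical values:
import pandas as pd

rows = [{'file_name': 'a.nii.gz', 'mean': -512.3},
        {'file_name': 'b.nii.gz', 'mean': -498.7}]
df = pd.DataFrame(rows).set_index('file_name')
df.to_csv('mean_intensity.csv')  # hypothetical output path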
def filtering(total_sess_list):
    # print(f'# Total inconsistency sess: {len(total_sess_list)}')

    # print(f'How many cases left:')
    file_name_list = read_file_contents_list(valid_bmi_file_list)
    file_name_no_ext = [
        file_name.replace('.nii.gz', '') for file_name in file_name_list
    ]
    subj_all_list = ClinicalDataReaderSPORE._get_subj_list_from_sess_list(
        file_name_no_ext)
    # all_subj_list = ClinicalDataReaderSPORE._get_subj_list_from_sess_list(file_name_no_ext)
    #
    # long_sess_list = ClinicalDataReaderSPORE._get_longitudinal_sess_list(file_name_no_ext)

    # left_long_sess_list = [sess_name for sess_name in long_sess_list if sess_name not in total_sess_list]
    # left_subj_list = ClinicalDataReaderSPORE._get_subj_list_from_sess_list(left_long_sess_list)

    # save_file_contents_list(
    #     valid_bmi_file_list,
    #     [sess_name + '.nii.gz' for sess_name in total_sess_list]
    # )
    subj_with_valid_bmi_list = ClinicalDataReaderSPORE._get_subj_list_from_sess_list(
        total_sess_list)
    print(
        f'# Total consistent sess: {len(total_sess_list)} ({len(file_name_list)})'
    )
    print(
        f'# Total subjects with consistent sess: {len(subj_with_valid_bmi_list)} ({len(subj_all_list)})'
    )

    file_name_include_total = [
        sess_name + '.nii.gz' for sess_name in total_sess_list
    ]

    save_file_contents_list(out_include_bmi_list, file_name_include_total)

    file_excluded_total = [
        sess_name + '.nii.gz' for sess_name in file_name_no_ext
        if sess_name not in total_sess_list
    ]
    save_file_contents_list(out_exclude_bmi_list, file_excluded_total)

    return file_name_include_total, file_excluded_total
def main():
    parser = argparse.ArgumentParser(
        description='Get the file list for a specified gender')
    parser.add_argument('--total-file-list',
                        type=str,
                        help='Only to filter out the files in this txt')
    parser.add_argument('--clinical-label-xlsx',
                        type=str,
                        help='Label file for clinical information')
    parser.add_argument('--gender-str',
                        type=str,
                        help='The label for gender type')
    parser.add_argument('--out-file-list-txt',
                        type=str,
                        help='Path to output file list txt file')
    args = parser.parse_args()

    clinical_data_reader = ClinicalDataReaderSPORE.create_spore_data_reader_xlsx(
        args.clinical_label_xlsx)
    in_file_list = read_file_contents_list(args.total_file_list)
    out_list = clinical_data_reader.filter_sublist_with_label(
        in_file_list, 'sex', args.gender_str)
    write_list_to_file(out_list, args.out_file_list_txt)
def analysis_correlation(args):
    result_df = pd.read_csv(args.out_csv)
    result_df = result_df.set_index('file_name')

    file_list = read_file_contents_list(args.file_list_txt)
    clinical_reader = ClinicalDataReaderSPORE.create_spore_data_reader_csv(
        in_clinical_csv)
    bmi_array, valid_file_name_list = clinical_reader.get_gt_value_BMI(
        file_list)

    valid_result_df = result_df.loc[valid_file_name_list]
    # valid_result_df['bmi'] = bmi_array

    valid_mean_list = valid_result_df['mean'].to_numpy()

    print(pearsonr(bmi_array, valid_mean_list))

    slope, intercept, r_value, p_value, std_err = linregress(
        bmi_array, valid_mean_list)
    reg_val = intercept + slope * bmi_array

    out_png = os.path.join('/nfs/masi/xuk9/SPORE/CAC_class/data',
                           'bmi_mean_lung.png')

    fig, ax = plt.subplots(figsize=(10, 7))
    ax.scatter(bmi_array, valid_mean_list, label='Samples')
    ax.plot(bmi_array,
            reg_val,
            color='r',
            label=f'Slope={slope:.3f}, p-value={p_value:.3E}')
    ax.set_xlabel('BMI ($kg/m^2$)')
    ax.set_ylabel('Averaged intensity (HU) in lung region')

    ax.legend(loc='best')

    plt.savefig(out_png, bbox_inches='tight', pad_inches=0.1)
    plt.close()
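
# analysis_correlation relies on scipy.stats: pearsonr returns (r, p-value) and
# linregress returns slope, intercept, r, p and standard error. A tiny check with
# made-up BMI / mean-intensity values:
import numpy as np
from scipy.stats import pearsonr, linregress

x = np.array([20.0, 25.0, 30.0, 35.0])           # hypothetical BMI values
y = np.array([-850.0, -830.0, -815.0, -800.0])   # hypothetical mean lung HU
print(pearsonr(x, y))
fit = linregress(x, y)
print(fit.slope, fit.intercept, fit.pvalue)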
def main():
    file_list = read_file_contents_list(file_list_txt)

    clinical_data_reader = ClinicalDataReaderSPORE.create_spore_data_reader_csv(
        in_csv_file)
    label_list = clinical_data_reader.get_label_for_obese(file_list)
    data_tuples = list(zip(file_list, label_list))
    label_df = pd.DataFrame(data_tuples, columns=['scan', 'label'])

    classifier_obj = MinibatchLinearClassifierWithCV.create_classifier_obj(
        in_folder, file_list, num_fold, label_df, batch_size)

    save_bin_path = path.join(proj_folder, 'model.bin')
    if if_run_training:
        classifier_obj.train()
        classifier_obj.validate()
        # classifier_obj.train_first_fold()
        # save_object(classifier_obj, save_bin_path)

    if if_run_validation:
        classifier_obj = load_object(save_bin_path)
        classifier_obj.valid_first_fold()
        auc_roc_first_fold = classifier_obj.validation_result[0]['roc_auc']
        print(f'auc_roc of fold 0: {auc_roc_first_fold}')
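
# save_object / load_object used above are assumed to be thin pickle wrappers; a
# minimal sketch of such helpers (not necessarily the project's implementation):
import pickle

def save_object_sketch(obj, bin_path):
    """Pickle obj to bin_path."""
    with open(bin_path, 'wb') as f:
        pickle.dump(obj, f)

def load_object_sketch(bin_path):
    """Load a pickled object from bin_path."""
    with open(bin_path, 'rb') as f:
        return pickle.load(f)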
def get_data_dict(config, file_list_txt):
    task = config['task']
    in_folder = config['input_img_dir']
    label_csv = config['label_csv']

    in_folder_obj = DataFolder(in_folder,
                               read_file_contents_list(file_list_txt))
    file_list = in_folder_obj.get_data_file_list()

    clinical_reader = ClinicalDataReaderSPORE.create_spore_data_reader_csv(
        label_csv)

    label_array = None
    file_list_with_valid_label = None

    if task == 'BMI':
        label_array, file_list_with_valid_label = clinical_reader.get_gt_value_BMI(
            file_list)
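    # Only the 'BMI' task is handled here; any other task value leaves label_array
    # and file_list_with_valid_label as None, and the lookups below will fail.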

    subject_list = [
        ClinicalDataReaderSPORE._get_subject_id_from_file_name(file_name)
        for file_name in file_list_with_valid_label
    ]

    in_folder_obj.set_file_list(file_list_with_valid_label)
    file_path_list = in_folder_obj.get_file_path_list()

    data_dict = {
        'img_names': file_list_with_valid_label,
        'img_subs': subject_list,
        'img_files': file_path_list,
        'gt_val': label_array
    }

    if config['add_jacobian_map']:
        in_jacobian_folder = config['input_jac_dir']
        in_jacobian_folder_obj = DataFolder(in_jacobian_folder,
                                            file_list_with_valid_label)
        jacobian_map_path_list = in_jacobian_folder_obj.get_file_path_list()
        data_dict['jacobian_maps'] = jacobian_map_path_list

    if config['add_valid_mask_map'] or config['apply_random_valid_mask']:
        in_valid_mask_folder = config['input_valid_mask_dir']
        in_valid_mask_folder_obj = DataFolder(in_valid_mask_folder,
                                              file_list_with_valid_label)
        valid_mask_path_list = in_valid_mask_folder_obj.get_file_path_list()
        data_dict['valid_masks'] = valid_mask_path_list

    if config['add_d_index_map']:
        in_d_index_map_folder = config['input_d_index_dir']
        in_d_index_map_folder_obj = DataFolder(in_d_index_map_folder,
                                               file_list_with_valid_label)
        d_index_map_path_list = in_d_index_map_folder_obj.get_file_path_list()
        data_dict['d_index_maps'] = d_index_map_path_list

    if config['add_jac_elem_maps']:
        in_jac_elem_folder = config['input_jac_elem_dir']
        in_jac_elem_folder_obj = DataFolder(in_jac_elem_folder,
                                            file_list_with_valid_label)
        for idx_elem in range(9):
            in_jac_elem_path_list = [
                map_path.replace('.nii.gz', f'_{idx_elem}.nii.gz')
                for map_path in in_jac_elem_folder_obj.get_file_path_list()
            ]
            data_dict[f'jac_elem_{idx_elem}_map'] = in_jac_elem_path_list

    return data_dict
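
# The returned data_dict holds parallel lists keyed by field name; a hypothetical
# consumer can zip them into per-scan records, e.g.:
def iter_samples(data_dict):
    """Yield one record per scan from the parallel lists in data_dict (sketch)."""
    keys = ('img_names', 'img_subs', 'img_files', 'gt_val')
    for name, sub, path, gt in zip(*(data_dict[k] for k in keys)):
        yield {'img_name': name, 'img_sub': sub, 'img_file': path, 'gt_val': gt}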
def _get_file_list(file_list_txt):
    return read_file_contents_list(file_list_txt)
def analyze_the_temporal_consistency_check(attr_flag):
    # Analysis
    label_obj = ClinicalDataReaderSPORE.create_spore_data_reader_csv(
        out_height_weight_added_csv)

    file_name_list = read_file_contents_list(valid_bmi_file_list)
    file_name_list = [f'{file_name}.nii.gz' for file_name in file_name_list]
    sess_list_all = [
        file_name.replace('.nii.gz', '') for file_name in file_name_list
    ]
    # inconsistency_data_dict = label_obj.temporal_consistency_check(attr_flag, file_name_list)

    longitudinal_data = get_longitudinal_info_in_raw_label_data()
    inconsistency_data_dict = label_obj.temporal_consistency_check_using_raw_label(
        attr_flag, longitudinal_data, file_name_list)

    out_png = os.path.join('/nfs/masi/xuk9/SPORE/CAC_class/clinical',
                           f'inconsistency_hist_{attr_flag}.png')

    inconsistency_list = np.array([
        inconsistency_data_dict[sess]['inconsistent_score']
        for sess in inconsistency_data_dict
    ])

    percentile_pos = 95

    percentile_val = np.percentile(inconsistency_list, percentile_pos)

    hist_plot_with_95_percentile(inconsistency_list, percentile_pos,
                                 percentile_val, out_png)

    # Return the inconsistent session name list

    sess_list = [sess for sess in inconsistency_data_dict]
    inconsistency_idx_list = np.argwhere(
        inconsistency_list > percentile_val)[:, 0]

    # sort the inconsistent cases
    score_list_inconsistency_only = inconsistency_list[inconsistency_idx_list]
    sorted_descending_idx_list = np.argsort(score_list_inconsistency_only)[::-1]

    sess_list_inconsistency_only = [
        sess_list[idx] for idx in inconsistency_idx_list
    ]
    sess_list_inconsistency_only = [
        sess_list_inconsistency_only[idx] for idx in sorted_descending_idx_list
    ]

    # print(sess_list_inconsistency_only[:10])
    subj_list = ClinicalDataReaderSPORE._get_subj_list_from_sess_list(
        sess_list_inconsistency_only)
    subj_spore_format_list = [f'SPORE_{subj_id:08d}' for subj_id in subj_list]

    out_inconsist_subj_data_file = open(out_inconsist_subj_data, 'w')
    # long_data_subj_inconsist_only = {}
    for subj_spore_format in subj_spore_format_list:
        out_inconsist_subj_data_file.write(f'{subj_spore_format}\n')
        subj_data = longitudinal_data[subj_spore_format]
        bmi_array = subj_data['bmi']
        out_inconsist_subj_data_file.write(f'{bmi_array} \n')
        score_array = []
        for bmi_val in bmi_array:
            abs_shift = np.abs(bmi_array - bmi_val)
            sorted_abs_shift = np.sort(abs_shift)
            score_array.append(sorted_abs_shift[1])
        score_array = np.array(score_array)
        out_inconsist_subj_data_file.write(f'{score_array} \n')
        out_inconsist_subj_data_file.write('\n')
    out_inconsist_subj_data_file.close()

    consistency_idx_list = np.argwhere(inconsistency_list <= percentile_val)[:, 0]
    consist_sess = [sess_list[idx] for idx in consistency_idx_list]

    non_long_sess = [
        sess_name for sess_name in sess_list_all if sess_name not in sess_list
    ]

    return consist_sess, sess_list_inconsistency_only, percentile_val, non_long_sess
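
# The per-scan inconsistency score written above is the absolute BMI distance to the
# nearest other scan of the same subject (sorted_abs_shift[1]; index 0 is the scan's
# zero distance to itself). Minimal illustration with a hypothetical subject:
import numpy as np

bmi_array = np.array([24.1, 24.5, 31.0])
scores = [np.sort(np.abs(bmi_array - v))[1] for v in bmi_array]
print(scores)  # approximately [0.4, 0.4, 6.5] -> the 31.0 scan stands out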