def main():
    """Export one projected component as a histogram PNG and a CSV table.

    Loads the object at ``--load-dr-bin``, takes column ``--component-id``
    (1-based) of its ``'projected_matrix'``, saves a histogram of the values
    to ``--save-bin-png`` and a ``Scan``/``Value`` table to ``--save-csv``.
    """
    parser = argparse.ArgumentParser(description='Load a saved pca object')
    parser.add_argument('--load-dr-bin', type=str)
    parser.add_argument('--component-id', type=int)
    parser.add_argument('--save-bin-png', type=str)
    parser.add_argument('--save-csv', type=str)
    args = parser.parse_args()

    dr_data = load_object(args.load_dr_bin)
    scans = dr_data['file_list']
    projected = dr_data['projected_matrix']

    # The command-line id is 1-based; matrix columns are 0-based.
    component_values = projected[:, args.component_id - 1]

    rows = []
    for scan_idx in range(len(scans)):
        rows.append({
            'Scan': scans[scan_idx],
            'Value': component_values[scan_idx]
        })

    plt.hist(component_values)
    plt.savefig(args.save_bin_png)

    table = pd.DataFrame(rows)
    logger.info(f'Save csv to {args.save_csv}')
    table.to_csv(args.save_csv, index=False)
def main():
    """Regress image features against 'bmi' and store the complement projection.

    Builds a feature matrix from each scan's ``'ImageData'`` and a target
    vector from each scan's ``'bmi'``, runs ``EigenThoraxLinearRegression1D``,
    overwrites every scan's ``'ImageData'`` with the corresponding row returned
    by ``project_to_complement_space()``, and saves the dict to
    ``--out-data-dict-bin``.
    """
    parser = argparse.ArgumentParser(
        description='Eliminate the 1D subspace that correspond to BMI')
    parser.add_argument('--in-data-dict-bin', type=str)
    parser.add_argument('--in-feature-dim', type=int, default=20)
    parser.add_argument('--out-data-dict-bin', type=str)
    args = parser.parse_args()

    data_dict = load_object(args.in_data_dict_bin)
    scan_names = list(data_dict.keys())
    num_scan = len(scan_names)

    features = np.zeros((num_scan, args.in_feature_dim), dtype=float)
    targets = np.zeros((num_scan, ), dtype=float)
    for row, scan_name in enumerate(scan_names):
        features[row, :] = data_dict[scan_name]['ImageData'][:]
        targets[row] = data_dict[scan_name]['bmi']

    regression = EigenThoraxLinearRegression1D(features, targets)
    regression.run_regression()
    projected_features = regression.project_to_complement_space()

    # Write the projected rows back in the same scan order used above.
    for row, scan_name in enumerate(scan_names):
        data_dict[scan_name]['ImageData'] = projected_features[row, :]

    save_object(data_dict, args.out_data_dict_bin)
def main():
    """Embed low-dimensional samples with t-SNE and store the embedding.

    Each entry of the loaded object contributes its ``'low_dim'`` vector as one
    row of the input matrix; the resulting embedding row is written back to
    the entry under ``'tsne_data'`` before the object is saved to
    ``--save-bin-path``.
    """
    parser = argparse.ArgumentParser(description='Load a saved pca object')
    parser.add_argument('--low-dim-bin-path', type=str)
    parser.add_argument('--save-bin-path', type=str)
    parser.add_argument('--num-pca-component', type=int, default=10)
    parser.add_argument('--dim-embedded', type=int, default=2)
    args = parser.parse_args()

    logger.info(f'Load low dim data from {args.low_dim_bin_path}')
    samples = load_object(args.low_dim_bin_path)
    num_sample = len(samples)

    data_matrix = np.zeros((num_sample, args.num_pca_component))
    for row in range(num_sample):
        data_matrix[row, :] = samples[row]['low_dim'][:]

    logger.info(f'Num of sample: {data_matrix.shape[0]}')
    logger.info(f'Num of included PCs: {data_matrix.shape[1]}')

    logger.info('Start tSNE')
    # A variant with learning_rate=10000 (and no n_iter override) was tried
    # earlier; the active configuration is the long-iteration one below.
    tsne = TSNE(perplexity=50,
                n_iter=100000,
                n_components=args.dim_embedded)
    embedded_matrix = tsne.fit_transform(data_matrix)
    logger.info('Complete')

    logger.info(f'Output shape: {embedded_matrix.shape}')

    for row in range(num_sample):
        samples[row]['tsne_data'] = embedded_matrix[row, :]

    logger.info(f'Save data to {args.save_bin_path}')
    save_object(samples, args.save_bin_path)
def load_data(self, in_res_matrix_path, num_res_pc, in_jac_matrix_path, num_jac_pc):
    """Load two projected matrices and concatenate their leading columns.

    The first ``num_res_pc`` columns of the residual object's
    ``'projected_matrix'`` are placed in front of the first ``num_jac_pc``
    columns of the Jacobian object's ``'projected_matrix'``. The file list is
    taken from the residual object.
    """
    self._in_res_matrix_obj = load_object(in_res_matrix_path)
    self._num_res_pc = num_res_pc
    self._in_jac_matrix_obj = load_object(in_jac_matrix_path)
    self._num_jac_pc = num_jac_pc

    self._file_list = self._in_res_matrix_obj['file_list']

    total_dim = num_res_pc + num_jac_pc
    res_projected = self._in_res_matrix_obj['projected_matrix']

    # Row count comes from the residual matrix.
    combined = np.zeros((res_projected.shape[0], total_dim))
    self._use_data_matrix = combined

    combined[:, :num_res_pc] = res_projected[:, :num_res_pc]
    jac_projected = self._in_jac_matrix_obj['projected_matrix']
    combined[:, num_res_pc:total_dim] = jac_projected[:, :num_jac_pc]
def load_data(self, in_data_matrix_bin_path, num_pc):
    """Load a projected-matrix object and keep its first ``num_pc`` columns.

    Stores the loaded object, ``num_pc``, the object's ``'file_list'`` and
    the column-sliced ``'projected_matrix'`` on the instance.
    """
    logger.info(f'Load bin data file {in_data_matrix_bin_path}')
    loaded = load_object(in_data_matrix_bin_path)

    self._data_obj = loaded
    self._num_pc = num_pc
    self._file_list = loaded['file_list']
    self._use_data_matrix = loaded['projected_matrix'][:, :num_pc]
def main():
    """Call ``get_optimal_AMI_cancer_first_year`` on a loaded data dict.

    The dict at ``--in-data-dict-bin`` and ``--n-features`` are handed to
    ``ClusterAnalysisDimAnalyzer``.
    """
    arg_parser = argparse.ArgumentParser(description='KMean clustering analysis')
    arg_parser.add_argument('--in-data-dict-bin', type=str)
    arg_parser.add_argument('--n-features', type=int)
    cli = arg_parser.parse_args()

    data_dict = load_object(cli.in_data_dict_bin)

    analyzer = ClusterAnalysisDimAnalyzer(data_dict, cli.n_features)
    analyzer.get_optimal_AMI_cancer_first_year()
def main():
    """Run ``RunICA`` on a loaded data dict and save its output.

    Input is read from ``--in-data-dict-bin``; the result is written by
    ``save_data_dict_bin`` to ``--out-data-dict-bin``.
    """
    arg_parser = argparse.ArgumentParser(description='Load a saved pca object')
    arg_parser.add_argument('--in-data-dict-bin', type=str)
    arg_parser.add_argument('--out-data-dict-bin', type=str)
    cli = arg_parser.parse_args()

    data_dict = load_object(cli.in_data_dict_bin)

    ica_runner = RunICA(data_dict)
    ica_runner.run_ica()
    ica_runner.save_data_dict_bin(cli.out_data_dict_bin)
def main():
    """Run ``FSDimReduction1D.run_dim_reduct('Age')`` and save the result.

    The data dict is read from ``--in-data-dict-bin`` and passed together with
    ``--in-feature-dim`` to ``FSDimReduction1D``; output goes to
    ``--out-data-dict-bin`` via ``save_bin``.
    """
    arg_parser = argparse.ArgumentParser(
        description='Eliminate the 1D subspace that correspond to BMI')
    arg_parser.add_argument('--in-data-dict-bin', type=str)
    arg_parser.add_argument('--in-feature-dim', type=int, default=20)
    arg_parser.add_argument('--out-data-dict-bin', type=str)
    cli = arg_parser.parse_args()

    data_dict = load_object(cli.in_data_dict_bin)

    reducer = FSDimReduction1D(data_dict, cli.in_feature_dim)
    # NOTE(review): the CLI description mentions BMI, but the field used is 'Age'.
    reducer.run_dim_reduct('Age')
    reducer.save_bin(cli.out_data_dict_bin)
def main():
    """Build a data dict from PCA output plus clinical labels and save it.

    ``generate_data_dict`` receives the object loaded from
    ``--in-pca-data-bin`` and a SPORE reader created from the xlsx at
    ``--label-file``; its return value is saved to ``--out-data-dict-bin``.
    """
    arg_parser = argparse.ArgumentParser(description='Load a saved pca object')
    arg_parser.add_argument('--in-pca-data-bin', type=str)
    arg_parser.add_argument('--label-file', type=str)
    arg_parser.add_argument('--out-data-dict-bin', type=str)
    args = arg_parser.parse_args()

    pca_samples = load_object(args.in_pca_data_bin)
    label_reader = ClinicalDataReaderSPORE.create_spore_data_reader_xlsx(
        args.label_file)

    combined_dict = generate_data_dict(pca_samples, label_reader)

    logger.info(f'Save dict data object to {args.out_data_dict_bin}')
    save_object(combined_dict, args.out_data_dict_bin)
def main():
    """Call ``run_meta_data_kmeans`` on a loaded data dict.

    The analyzer is invoked with the fields 'bmi', 'Age' and 'Packyear', the
    field 'CancerSubjectFirstScan', the constant 10 and ``--out-png-folder``.
    """
    arg_parser = argparse.ArgumentParser(description='KMean clustering analysis')
    arg_parser.add_argument('--in-data-dict-bin', type=str)
    arg_parser.add_argument('--n-features', type=int)
    arg_parser.add_argument('--out-png-folder', type=str)
    cli = arg_parser.parse_args()

    data_dict = load_object(cli.in_data_dict_bin)
    analyzer = ClusterAnalysisDimAnalyzer(data_dict, cli.n_features)

    meta_fields = ['bmi', 'Age', 'Packyear']
    analyzer.run_meta_data_kmeans(
        meta_fields,
        'CancerSubjectFirstScan',
        10,
        cli.out_png_folder)
def main():
    """Pass PCA output and clinical labels to ``generate_effective_data_csv``.

    The PCA object comes from ``--in-pca-data-bin``, the label reader from the
    xlsx at ``--label-file``, and the CSV destination from ``--out-data-csv``.
    """
    arg_parser = argparse.ArgumentParser(description='Load a saved pca object')
    arg_parser.add_argument('--in-pca-data-bin', type=str)
    arg_parser.add_argument('--label-file', type=str)
    arg_parser.add_argument('--out-data-csv', type=str)
    cli = arg_parser.parse_args()

    csv_path = cli.out_data_csv

    pca_samples = load_object(cli.in_pca_data_bin)
    label_reader = ClinicalDataReaderSPORE.create_spore_data_reader_xlsx(
        cli.label_file)

    generate_effective_data_csv(pca_samples, label_reader, csv_path)
def run_dimension_reduction(self, save_bin_path):
    """Project the stored feature matrix with the saved PCA and save it.

    Loads the PCA from ``self._pca_bin_path`` and the feature object from
    ``self._data_bin_path``, transforms the object's ``'data_matrix'``, and
    saves a dict holding the original ``'file_list'`` and the new
    ``'projected_matrix'`` to ``save_bin_path``.
    """
    pca_wrapper = PCA_NII_3D(None, None, 1)
    pca_wrapper.load_pca(self._pca_bin_path)

    feature_data = load_object(self._data_bin_path)

    pca_model = pca_wrapper._get_pca()
    projected = pca_model.transform(feature_data['data_matrix'])

    result = {
        'file_list': feature_data['file_list'],
        'projected_matrix': projected
    }
    save_object(result, save_bin_path)
def main():
    """Call the cancer-subject first-scan k-means field-list plot.

    Passes a fixed list of fields, ``--n-cluster`` (default 10) and
    ``--out-png-folder`` to
    ``plot_kmean_n_cluster_field_list_cancer_subject_first_scan``.
    """
    arg_parser = argparse.ArgumentParser(description='KMean clustering analysis')
    arg_parser.add_argument('--in-data-dict-bin', type=str)
    arg_parser.add_argument('--n-features', type=int)
    arg_parser.add_argument('--out-png-folder', type=str)
    arg_parser.add_argument('--n-cluster', type=int, default=10)
    cli = arg_parser.parse_args()

    data_dict = load_object(cli.in_data_dict_bin)
    analyzer = ClusterAnalysisDimAnalyzer(data_dict, cli.n_features)

    plot_fields = [
        'CancerSubjectFirstScan',
        'COPD',
        'Coronary Artery Calcification',
        'Age',
        'Packyear',
        'bmi',
    ]
    analyzer.plot_kmean_n_cluster_field_list_cancer_subject_first_scan(
        plot_fields, cli.n_cluster, cli.out_png_folder)
def main():
    """Call ``ElbowSilhouettePlot`` with two PNG paths under the output folder.

    The paths are ``elbow_plot.png`` and ``silhouette_plot.png`` inside
    ``--out-png-folder``.
    """
    arg_parser = argparse.ArgumentParser(description='Load a saved pca object')
    arg_parser.add_argument('--in-data-dict-bin', type=str)
    arg_parser.add_argument('--n-features', type=int)
    arg_parser.add_argument('--out-png-folder', type=str)
    cli = arg_parser.parse_args()

    raw_dict = load_object(cli.in_data_dict_bin)
    data_dict_obj = ClusterAnalysisDataDict(raw_dict, cli.n_features)
    cluster_search = ClusterAnalysisSearchNumCluster(data_dict_obj)

    elbow_png = os.path.join(cli.out_png_folder, 'elbow_plot.png')
    silhouette_png = os.path.join(cli.out_png_folder, 'silhouette_plot.png')
    cluster_search.ElbowSilhouettePlot(elbow_png, silhouette_png)
def main():
    """Request summary characteristics for the subjects in the feature matrix.

    Relies on the module-level names ``in_feature_matrix_bin``, ``spore_csv``
    and ``ori_spore_excel``. A fixed set of attributes is pulled from the
    original label spreadsheet into the reader before
    ``get_summary_characteristics_subject`` is called.
    """
    scan_files = load_object(in_feature_matrix_bin)['file_list']
    # Alternative source used previously:
    # read_file_contents_list(female_file_list)
    subjects = ClinicalDataReaderSPORE.get_subject_list(scan_files)

    reader = ClinicalDataReaderSPORE.create_spore_data_reader_csv(spore_csv)
    original_labels = pd.read_excel(ori_spore_excel)

    # Attribute names are passed exactly as written; order is preserved.
    attribute_names = (
        'copd',
        'Coronary Artery Calcification',
        'race',
        'LungRADS',
        'smokingstatus',
        'packyearsreported',
        'education',
        'cancer_bengin',
    )
    for attribute in attribute_names:
        reader.get_attributes_from_original_label_file(original_labels,
                                                       attribute)

    reader.get_summary_characteristics_subject(subjects)
def main():
    """Write ``data_full.csv`` and build a ``PlotCorrAnalyzeLDA`` from it.

    The CSV is written into ``--out-png-folder`` by
    ``PlotCorrAnalyzeLDA.generate_effective_data_csv``. ``--data-csv`` is
    accepted on the command line but not read here.
    """
    arg_parser = argparse.ArgumentParser(description='Load a saved pca object')
    arg_parser.add_argument('--in-pca-data-bin', type=str)
    arg_parser.add_argument('--out-png-folder', type=str)
    arg_parser.add_argument('--label-file', type=str)
    arg_parser.add_argument('--data-csv', type=str, default=None)
    cli = arg_parser.parse_args()

    full_csv_path = os.path.join(cli.out_png_folder, 'data_full.csv')

    pca_samples = load_object(cli.in_pca_data_bin)
    label_reader = ClinicalDataReaderSPORE.create_spore_data_reader_xlsx(
        cli.label_file)

    PlotCorrAnalyzeLDA.generate_effective_data_csv(
        pca_samples, label_reader, full_csv_path)

    # The created object is not used any further in this function.
    plot_obj = PlotCorrAnalyzeLDA.create_class_object_w_csv(full_csv_path)
def main():
    """Call ``plot_2D_grid_pack_field_tsne_list`` on a loaded data dict.

    The data dict is wrapped in ``ClusterAnalysisDataDict`` and passed to
    ``CorrelationAnalysis2OrthoSpace``; output goes to ``--out-png-folder``.
    """
    arg_parser = argparse.ArgumentParser(description='Load a saved pca object')
    arg_parser.add_argument('--in-data-dict-bin', type=str)
    arg_parser.add_argument('--n-features', type=int)
    arg_parser.add_argument('--out-png-folder', type=str)
    cli = arg_parser.parse_args()

    raw_dict = load_object(cli.in_data_dict_bin)
    data_dict_obj = ClusterAnalysisDataDict(raw_dict, cli.n_features)

    # Earlier experiments (CorrelationAnalysis bar plots / 2D dim plots and
    # the other plot_2D_* methods of the ortho-space analysis) are disabled;
    # only the t-SNE grid plot is produced.
    ortho_analysis = CorrelationAnalysis2OrthoSpace(data_dict_obj)
    ortho_analysis.plot_2D_grid_pack_field_tsne_list(cli.out_png_folder)
def main():
    """Train and/or validate a minibatch linear classifier.

    Relies on module-level names (``file_list_txt``, ``in_csv_file``,
    ``in_folder``, ``num_fold``, ``batch_size``, ``proj_folder``,
    ``if_run_training``, ``if_run_validation``). Labels come from
    ``get_label_for_obese``.
    """
    scan_files = read_file_contents_list(file_list_txt)
    clinical_reader = ClinicalDataReaderSPORE.create_spore_data_reader_csv(
        in_csv_file)
    obese_labels = clinical_reader.get_label_for_obese(scan_files)

    scan_label_pairs = list(zip(scan_files, obese_labels))
    label_df = pd.DataFrame(scan_label_pairs, columns=['scan', 'label'])

    classifier_obj = MinibatchLinearClassifierWithCV.create_classifier_obj(
        in_folder, scan_files, num_fold, label_df, batch_size)

    save_bin_path = path.join(proj_folder, 'model.bin')

    if if_run_training:
        classifier_obj.train()
        classifier_obj.validate()
        # Disabled alternative: train_first_fold() followed by
        # save_object(classifier_obj, save_bin_path).

    if if_run_validation:
        # The classifier is replaced by whatever is stored at save_bin_path.
        classifier_obj = load_object(save_bin_path)
        classifier_obj.valid_first_fold()
        first_fold_auc = classifier_obj.validation_result[0]['roc_auc']
        print(f'auc_roc of fold 0: {first_fold_auc}')
def main():
    """Produce the copd/age/packyear/ca_cal/bmi plots with ``PlotSpacePCA``.

    With ``--data-csv`` the plot object is created from that CSV. Otherwise it
    is created from the ``--in-data-bin`` object plus the ``--label-file``
    reader, and its label table is saved as ``data.csv`` in the output folder.
    """
    arg_parser = argparse.ArgumentParser(description='Load a saved pca object')
    arg_parser.add_argument('--in-data-bin', type=str)
    arg_parser.add_argument('--out-png-folder', type=str)
    arg_parser.add_argument('--label-file', type=str)
    arg_parser.add_argument('--data-csv', type=str, default=None)
    arg_parser.add_argument('--low-dim-data-flag', type=str, default='low_dim')
    cli = arg_parser.parse_args()

    def in_out_folder(file_name):
        # Join a file name onto the output folder.
        return os.path.join(cli.out_png_folder, file_name)

    if cli.data_csv is None:
        samples = load_object(cli.in_data_bin)
        label_reader = ClinicalDataReaderSPORE.create_spore_data_reader_xlsx(
            cli.label_file)
        plotter = PlotSpacePCA.create_class_object_w_data(
            samples, label_reader, cli.low_dim_data_flag)
        plotter.save_label_file(in_out_folder('data.csv'))
    else:
        plotter = PlotSpacePCA.create_class_object_w_csv(cli.data_csv)

    plotter.plot_copd(in_out_folder('copd.png'))
    plotter.plot_age(in_out_folder('age.png'))
    plotter.plot_packyear(in_out_folder('packyear.png'))
    plotter.plot_ca_cal(in_out_folder('ca_cal.png'))
    plotter.plot_bmi(in_out_folder('bmi.png'))
def main():
    """Compare elbow and silhouette curves across several data dicts.

    Each data dict listed in ``bin_data_specs`` gets one row in the figure:
    the left panel plots the elbow values over 1..10 clusters, the right
    panel plots the silhouette values over 2..10 clusters. The figure is
    saved as ``optimal_num_cluster.png`` in ``--out-png-folder``.

    The number of rows is derived from ``bin_data_specs``, so adding or
    removing an entry needs no other change in this function.
    """
    parser = argparse.ArgumentParser(description='Load a saved pca object')
    parser.add_argument('--bin-folder', type=str)
    parser.add_argument('--out-png-folder', type=str)
    args = parser.parse_args()

    # (file name inside --bin-folder, legend label, number of features).
    # Intermediate reductions (BMI only, BMI x2, Age, Packyear) were listed
    # here in the past; append further tuples to compare more data dicts.
    bin_data_specs = [
        ('init_data_dict.bin', 'original (#dim=20)', 20),
        ('reduct_packyear_1_data_dict.bin',
         'reduce BMI, Age and Packyear (#dim=15)', 15),
    ]

    n_cluster_range = range(1, 11)

    # A bare figure is used so that no unused full-figure Axes sits beneath
    # the GridSpec panels.
    fig = plt.figure(figsize=(20, 14))
    gs = gridspec.GridSpec(len(bin_data_specs), 2)

    ax_list = []
    for idx_row, (bin_file, bin_data_name, n_features) in enumerate(bin_data_specs):
        bin_data_dict = load_object(os.path.join(args.bin_folder, bin_file))
        data_dict_obj = ClusterAnalysisDataDict(bin_data_dict, n_features)
        optimal_cluster_num_obj = ClusterAnalysisSearchNumCluster(data_dict_obj)
        elbow_list, silhouette_list = \
            optimal_cluster_num_obj.get_elbow_and_silhouette_array()

        ax_elbow = plt.subplot(gs[idx_row, 0])
        ax_elbow.plot(n_cluster_range, elbow_list, label=bin_data_name)
        ax_elbow.set_title('Sum of squared distance to cluster centroids')
        ax_list.append(ax_elbow)

        # The first entry (one cluster) is skipped for the silhouette curve.
        ax_silhouette = plt.subplot(gs[idx_row, 1])
        ax_silhouette.plot(n_cluster_range[1:], silhouette_list[1:],
                           label=bin_data_name)
        ax_silhouette.set_title('Silhouette score')
        ax_list.append(ax_silhouette)

    for ax in ax_list:
        ax.legend(loc='best')

    out_png = os.path.join(args.out_png_folder, 'optimal_num_cluster.png')
    logger.info(f'Save to {out_png}')
    fig.tight_layout()
    plt.savefig(out_png)
    plt.close()
def load_data(self):
    """Load the object stored at ``self._data_bin_path`` into ``self._data_obj``."""
    loaded = load_object(self._data_bin_path)
    self._data_obj = loaded
def load_pca(self, bin_path):
    """Load a saved PCA object from ``bin_path`` into ``self._pca``.

    A progress message is printed (and flushed) before loading.
    """
    # The f-string placeholder is `{...}`; the previous `${...}` form printed
    # a literal '$' in front of the path.
    print(f'Loading pca from {bin_path}', flush=True)
    self._pca = load_object(bin_path)