def compute_ROIs(generate_csv=False, version=0,
                 patientTxtPath='/media/shared/datasets/LUNA/CSVFILES/patients_train.txt',
                 mode='train'):
    """Generate DL-I ROI patches for the patients listed in `patientTxtPath`
    and store them as .npz archives under PATCHES_PATH.

    Parameters
    ----------
    generate_csv : bool
        If True, also write a per-patient summary CSV ``ROIs_v{version}_{mode}.csv``
        with columns patient, nodules (tp), detected_regions (tp+fn+fp).
    version : int
        Version tag embedded in the output file names.
    patientTxtPath : str
        Text file with one patient .npz filename per line.
    mode : str
        Split tag ('train'/'test'); only used in output file names.
    """
    # Single CSV path shared by the header write and the per-patient appends.
    csv_path = 'ROIs_v{}_{}.csv'.format(version, mode)
    if generate_csv:
        with open(csv_path, 'w') as f:
            # BUG FIX: the header previously had no trailing newline, so the
            # first data row was appended onto the header line.
            f.write('patient,nodules,detected_regions\n')

    # ## PATIENTS FILE LIST // not really useful, remove
    filter_annotated = False
    # Filter patients with no annotations to avoid having to read them.
    patients_with_annotations = pd.read_csv(NODULES_PATH)
    patients_with_annotations = list(set(patients_with_annotations['seriesuid']))
    patients_with_annotations = ["luna_%s.npz" % p.split('.')[-1]
                                 for p in patients_with_annotations]

    with open(patientTxtPath, 'r') as f:
        filenames = [line.strip() for line in f]
    filenames = [os.path.join(INPUT_PATH, fp) for fp in filenames
                 if fp in patients_with_annotations or not filter_annotated]
    # Materialize to a list: filter() is a lazy, single-use iterator on
    # Python 3, which is unsafe to hand to a multiprocessing generator.
    filenames = [fp for fp in filenames if os.path.isfile(fp)]

    def __load_and_store(filename):
        # Load the preprocessed volume and extract candidate ROIs with
        # ground truth, keeping per-patient detection stats.
        patient_data = np.load(filename)['arr_0'].astype(np.int16)
        X, y, rois, stats = common.load_patient(
            patient_data, discard_empty_nodules=True, output_rois=True,
            debug=True, include_ground_truth=True, thickness=1)
        if not stats:
            # Patient produced no regions at all.
            stats = {'tp': 0, 'fn': 0, 'fp': 0}
        logging.info("Patient: %s, stats: %s" % (filename.split('/')[-1], stats))
        if generate_csv:
            with open(csv_path, 'a') as f:
                f.write('{},{},{}\n'.format(filename.split('/')[-1][:-4],
                                            stats['tp'], sum(stats.values())))
        return X, y, stats

    common.multiproc_crop_generator(
        filenames,
        os.path.join(PATCHES_PATH, 'dl1_v{}_x_{}.npz'.format(version, mode)),
        os.path.join(PATCHES_PATH, 'dl1_v{}_y_{}.npz'.format(version, mode)),
        __load_and_store)
def compute_ROIs(generate_csv=False, version_dl2=0, SCORE_TH=0.5,
                 patientTxtPath='/media/shared/datasets/LUNA/CSVFILES/patients_train.txt',
                 mode='train', dataset='luna'):
    """Loads the output of DL-I and keeps just the 1's (TP or FN's) and the
    FP's above a given score (SCORE_TH) to build training patches for DL-II.

    Parameters
    ----------
    generate_csv : bool
        If True, also write a per-patient summary CSV under /home/shared/output.
    version_dl2 : int
        Version tag embedded in the output file names.
    SCORE_TH : float
        Minimum DL-I score for a non-nodule candidate to be kept (hard FPs).
    patientTxtPath : str
        Text file with one patient filename per line.
    mode : str
        'train' selects OUTPUT_DL1, anything else OUTPUT_DL1_TEST.
    dataset : str
        'luna' or 'isbi'; selects the preprocessed-data location.
    """
    # BUG FIX: the header used to be written to "ROIs_dl2_v{v}_{mode}.csv"
    # while the data rows were appended to "ROIs_dl2_v{v}_{mode}_{dataset}.csv",
    # leaving the data file headerless. One shared path is used for both now;
    # the missing trailing newline after the header is also fixed.
    csv_path = '/home/shared/output/ROIs_dl2_v{}_{}_{}.csv'.format(
        version_dl2, mode, dataset)
    if generate_csv:
        with open(csv_path, 'w') as f:
            f.write('patient,nodules,detected_regions\n')

    # ## PATIENTS FILE LIST // not really useful, remove
    if mode == 'train':
        nodules_df = pd.read_csv(OUTPUT_DL1)
    else:
        nodules_df = pd.read_csv(OUTPUT_DL1_TEST)
    # Keep every true nodule (label == 1) regardless of score, plus the
    # high-score false positives worth hard-negative mining.
    nodules_df = nodules_df[(nodules_df['score'] > SCORE_TH) | (nodules_df['label'] == 1)]
    nodules_df['nslice'] = nodules_df['nslice'].astype(int)
    logging.info("Shape nodules df: %s" % str(nodules_df.shape))

    patients = [p + '.npz' for p in set(nodules_df['patientid'])]
    with open(patientTxtPath, 'r') as f:
        files = [line.strip() for line in f]

    if dataset == 'isbi':
        # ISBI patient lists already carry absolute paths.
        patients = ['/media/shared/datasets/ISBI/preprocessedNew/' + p for p in patients]
        filenames = [fp for fp in files if fp in patients]
    else:
        filenames = [os.path.join(INPUT_PATH, fp) for fp in files if fp in patients]

    def __load_and_store(filename):
        # Load the preprocessed volume and crop only the regions proposed by
        # DL-I for this patient (ndf).
        patient_data = np.load(filename)['arr_0'].astype(np.int16)
        ndf = nodules_df[nodules_df['patientid'] == filename.split('/')[-1].split('.')[0]]
        X, y, rois, stats = common.load_patient(patient_data, ndf,
                                                output_rois=True, thickness=1)
        if not stats:
            stats = {'tp': 0, 'fn': 0, 'fp': 0}
        logging.info("Patient: %s, stats: %s" % (filename.split('/')[-1], stats))
        if generate_csv:
            with open(csv_path, 'a') as f:
                f.write('{},{},{}\n'.format(filename.split('/')[-1][:-4],
                                            stats['tp'], sum(stats.values())))
        return X, y, stats

    common.multiproc_crop_generator(
        filenames,
        os.path.join(PATCHES_PATH, 'dl2_v{}_x_{}_{}.npz'.format(version_dl2, mode, dataset)),
        os.path.join(PATCHES_PATH, 'dl2_v{}_y_{}_{}.npz'.format(version_dl2, mode, dataset)),
        __load_and_store)
# DL-I v2 patch generation: split the patient files into train/test and crop
# ROI patches for each split. NOTE(review): `filenames`, `filenames_t` and
# `patients_with_annotations` are defined earlier in this script (not visible
# in this chunk) — confirm against the full file.
filenames_train = [os.path.join(INPUT_PATH, fp) for fp in filenames
                   if fp in patients_with_annotations]
filenames_test = [os.path.join(INPUT_PATH, fp) for fp in filenames_t
                  if fp in patients_with_annotations]


def __load_and_store(filename):
    # Load the preprocessed volume and extract ROI patches with ground truth.
    patient_data = np.load(filename)['arr_0'].astype(np.int16)
    X, y, rois, stats = common.load_patient(
        patient_data, discard_empty_nodules=True, output_rois=True,
        debug=True, include_ground_truth=True, thickness=1)
    logging.info("Patient: %s, stats: %s" % (filename.split('/')[-1], stats))
    return X, y, stats


common.multiproc_crop_generator(filenames_train,
                                os.path.join(PATCHES_PATH, 'dl1_v2_x_train.npz'),
                                os.path.join(PATCHES_PATH, 'dl1_v2_y_train.npz'),
                                __load_and_store)
common.multiproc_crop_generator(filenames_test,
                                os.path.join(PATCHES_PATH, 'dl1_v2_x_test.npz'),
                                os.path.join(PATCHES_PATH, 'dl1_v2_y_test.npz'),
                                __load_and_store)

# BUG FIX: removed a bare "=======" merge-conflict marker that was left here
# between the live code and the commented-out block below — it was a Python
# syntax error.

### PATCHES GENERATION -----------------------------------------------------------------
#
# ## PATIENTS FILE LIST
# patients_with_annotations = pd.read_csv(NODULES_PATH)  # filter patients with no annotations to avoid having to read them
# patients_with_annotations = list(set(patients_with_annotations['seriesuid']))
# patients_with_annotations = ["luna_%s.npz" % p.split('.')[-1] for p in patients_with_annotations]
#
# filenames = os.listdir(INPUT_PATH)
patid = filename.split('/')[-1] ndf = nodules_df[nodules_df['patientid'] == patid] X, y, rois, stats = common.load_patient(patient_data, ndf, output_rois=True, thickness=1) label = int(label_df[label_df['id'] == patid]['cancer']) y = [label] * len(X) logging.info("Patient: %s, cancer:%d, stats: %s" % (patid, label, stats)) return X, y, stats common.multiproc_crop_generator(filenames_train, os.path.join(PATCHES_PATH, 'dl3_v1_x_train.npz'), os.path.join(PATCHES_PATH, 'dl3_v1_y_train.npz'), __load_and_store, parallel=True) common.multiproc_crop_generator(filenames_test, os.path.join(PATCHES_PATH, 'dl3_v1_x_test.npz'), os.path.join(PATCHES_PATH, 'dl3_v1_y_test.npz'), __load_and_store, parallel=True) ### TRAINING ------------------------------------------------------------------------------------------------------- # Data augmentation generator