# Radiologist characteristic ratings copied from a pylidc annotation onto a
# matched candidate nodule, in the order they are assigned.
_ANNOTATION_ATTRIBUTES = (
    'subtlety',
    'internalStructure',
    'calcification',
    'sphericity',
    'margin',
    'lobulation',
    'spiculation',
    'texture',
    'malignancy',
)


def _copy_annotation_attributes(target, annotation):
    """Copy every rating in _ANNOTATION_ATTRIBUTES from `annotation` to `target` as int."""
    for attribute in _ANNOTATION_ATTRIBUTES:
        setattr(target, attribute, int(getattr(annotation, attribute)))


def _label_matching_nodules(nods, clusters, shape, image, crop, exact):
    """Transfer ratings from each annotation cluster to the candidates it overlaps.

    Args:
        nods: dict of candidate nodules keyed 0..len(nods)-1; each value
            exposes an `array` mask and receives the rating attributes.
        clusters: annotation clusters from `scan.cluster_annotations()`.
        shape: shape of the raw scan volume.
        image: preprocessed image passed through to `reshape_mask`.
        crop: `(x1, x2, y1, y2, z1, z2)` bounds applied to the reshaped mask.
        exact: when True a candidate must contain the whole consensus mask;
            when False, containing at least 90% of it is enough.
    """
    x1, x2, y1, y2, z1, z2 = crop
    for cluster in clusters:
        # weakness -- only using one annotation when four is provided
        reference = cluster[0]
        nod_mask, nod_bbox, _ = consensus(cluster)
        scan_vol = np.zeros(shape)
        scan_vol[nod_bbox] = nod_mask
        temp_vol = reshape_mask(scan_vol, image)
        final_vol = temp_vol[x1:x2, y1:y2, z1:z2]
        # The mask total does not depend on the candidate, so compute it once.
        total = np.sum(final_vol)
        for b in range(len(nods)):
            overlap = np.sum(final_vol & nods[b].array)
            if overlap == total or (not exact and overlap >= 0.9 * total):
                _copy_annotation_attributes(nods[b], reference)


def getAtts(nods, pid):
    """Attach pylidc annotation ratings to the candidate nodules of one patient.

    The scan for patient `LIDC-IDRI-<pid zero-padded to 4 digits>` is loaded,
    its annotations are clustered, and each cluster's consensus mask is
    compared with every candidate in `nods`. Matching candidates receive the
    ratings of the first annotation in the cluster.

    When the number of clusters equals the number of candidates, a candidate
    must contain the entire consensus mask. Otherwise a message is printed and
    a 90% overlap is accepted instead.

    Args:
        nods: dict of candidate nodules keyed 0..len(nods)-1, modified in place.
        pid: patient number (anything `str()` turns into the numeric id).
    """
    assert type(nods) is dict
    scan = pl.query(pl.Scan).filter(
        pl.Scan.patient_id == 'LIDC-IDRI-{}'.format(str(pid).zfill(4))).first()
    image = preprocess(scan)
    lung_mask = segment_lung_mask(image, True)
    crop = create_bounding_box(lung_mask)
    shape = scan.to_volume().shape
    allNodules = scan.cluster_annotations()

    # An explicit comparison replaces the former assert/bare-except control
    # flow, which disappeared under `python -O` and also hid unrelated errors
    # behind the "numbers do not match" message.
    counts_match = len(allNodules) == len(nods)
    if not counts_match:
        print('pid: {}, nodules numbers do not match.'.format(pid))
    _label_matching_nodules(nods, allNodules, shape, image, crop,
                            exact=counts_match)
def main():
    """Export every LIDC scan and its nodule segmentation mask as NIfTI files.

    The output directory comes from `args.savedir` (with a `debug`
    sub-directory when `args.debug` is set). If it already exists, execution
    stops unless `args.overwrite` is set. In debug mode only the first scan
    returned by the query is converted.

    For each scan two files are written: `<patient_id>_volume.nii.gz` and
    `<patient_id>_segmask.nii.gz`, the latter being the union of the 50%
    consensus masks of all annotation clusters.
    """
    args = parser.parse_args()
    path = pathlib.Path(args.savedir)
    if args.debug:
        path = path / 'debug'
    print(f"Using {str(path)} as save directory")

    if path.exists() and path.is_dir():
        warnings.warn(f"Directory {str(path)} already exists.")
        if args.overwrite:
            print("Overwrite has been set. Continuing...")
        else:
            print("Terminating execution.")
            return
    else:
        path.mkdir(parents=True, exist_ok=True)

    if args.debug:
        scans = [pl.query(pl.Scan).first()]
    else:
        scans = pl.query(pl.Scan).all()

    for scan in scans:
        print(f"Converting patient {scan.patient_id}")
        vol = scan.to_volume()  # (numpy array)
        mask = np.zeros(vol.shape, dtype=bool)
        for nod in scan.cluster_annotations():
            # Pad so that cmask is the whole volume
            cmask, _, _ = consensus(
                nod,
                clevel=0.5,
                pad=[(vol.shape[i], vol.shape[i]) for i in range(3)])
            mask = np.logical_or(mask, cmask)

        numpy_to_nifti(vol, path / f"{scan.patient_id}_volume.nii.gz")
        # Bug fix: the segmentation file used to be written from `vol`, so it
        # duplicated the CT volume. Write the accumulated mask instead, cast
        # to uint8 because boolean arrays are not reliably accepted by NIfTI
        # writers (NOTE(review): confirm against numpy_to_nifti).
        numpy_to_nifti(mask.astype(np.uint8),
                       path / f"{scan.patient_id}_segmask.nii.gz")
def get_anns_con(scan):
    """Return a full-volume mask of the consensus regions of all nodules in `scan`.

    Args:
        scan: object providing `to_volume()` and `cluster_annotations()`.

    Returns:
        Float array with the shape of the scan volume: 1 inside any nodule's
        consensus mask (default `consensus` settings), 0 elsewhere.
    """
    vol = scan.to_volume()
    mask = np.zeros(vol.shape)
    for cluster in scan.cluster_annotations():
        cluster_mask, cluster_bbox, _ = consensus(cluster)
        # Merge rather than overwrite: where two bounding boxes overlap, a
        # plain assignment would let the later box's background erase the
        # earlier nodule's foreground.
        mask[cluster_bbox] = np.maximum(mask[cluster_bbox], cluster_mask)
    return mask
def get_image(nodule, vol):
    """Extract the central slice of a nodule's padded consensus bounding box.

    Args:
        nodule: sequence of annotations belonging to one nodule.
        vol: scan volume indexed by the consensus bounding box.

    Returns:
        The central slice after `normalize` followed by `zero_conter`.
    """
    first_annotation = nodule[0]
    print(first_annotation.malignancy, first_annotation.Malignancy)

    # Only the bounding box of the 50% consensus is needed; the box is padded
    # by 20 voxels on each side of the first two axes.
    bbox = consensus(nodule,
                     clevel=0.5,
                     pad=[(20, 20), (20, 20), (0, 0)])[1]

    # Index of the central slice along the third axis of the box.
    depth = bbox[2].stop - bbox[2].start
    middle = int(0.5 * depth)

    central_slice = vol[bbox][:, :, middle]
    return zero_conter(normalize(central_slice))
def prepare(pid):
    """Copy a patient's CT NRRD and write one consensus-mask NRRD per nodule.

    Files are written to `cf.raw_data_dir/<pid>/`: the scan as
    `<pid>_CT.nrrd` and each nodule's 50% consensus mask as
    `<pid>_nod_<index>.nrrd`, reusing the header of the source CT file.

    Args:
        pid: patient id used for the pylidc query and for the file names.
    """
    patient_dir = os.path.join(cf.raw_data_dir, pid)
    os.makedirs(patient_dir, exist_ok=True)

    # Get scan from pylidc
    scan = pl.query(pl.Scan).filter(pl.Scan.patient_id == pid).first()
    print("processing:", scan.patient_id)
    vol_shape = scan.to_volume().shape

    # Locate the source CT file (first glob match) and copy it over.
    ct_pattern = os.path.join(lidc_path, pid, "*", "*", f"{pid}_CT.nrrd")
    scan_path = glob.glob(ct_pattern)[0]
    copyfile(scan_path, os.path.join(patient_dir, f"{pid}_CT.nrrd"))

    for nodule_ix, nodule_anns in enumerate(scan.cluster_annotations()):
        # Build the 50% consensus mask and place it in a full-size volume.
        cmask, cbbox, _ = consensus(nodule_anns, clevel=0.5)
        full_mask = np.zeros(vol_shape)
        full_mask[cbbox] = cmask

        # Load header from NRRD
        header = nrrd.read_header(scan_path)

        # The first two axes are swapped before writing.
        swapped = np.swapaxes(full_mask, 0, 1)
        target = os.path.join(patient_dir, f"{pid}_nod_{nodule_ix}.nrrd")
        nrrd.write(target, swapped, header=header)
ct = 'LIDC-IDRI-' + str(i + 1) print("Pacient ID: ", str(ct)) try: # Query for a scan, and convert it to an array volume. scan = pl.query(pl.Scan).filter(pl.Scan.patient_id == ct).first() # scan1 = pl.query(pl.Annotation).filter(pl.Annotation.texture == 1) vol = scan.to_volume() nodules = scan.cluster_annotations() for l in range(len(nodules)): annotations = nodules[l] consensus_mask, consensus_bbox, _ = consensus(annotations, clevel=0.5, pad=[(0, 0), (0, 0), (0, 0)]) k = consensus_mask.shape[-1] // 2 # Save image and mask image = np.asarray(vol[consensus_bbox][:, :, k]) mask = np.float32(np.array(consensus_mask[:, :, k])) img_sitk = sitk.GetImageFromArray(image) sitk.WriteImage( img_sitk, f"/home/anatielsantos/mestrado/bases/cortes-lidc/image/__vol{i}_nod{l}.nii" ) mask_sitk = sitk.GetImageFromArray(mask)
plt.show()

# Annotation consensus ------------
# Query for a scan, and convert it to an array volume.
scan = pl.query(pl.Scan).filter(pl.Scan.patient_id == pid).first()
vol = scan.to_volume()

# Cluster the annotations for the scan, and grab the first cluster.
nods = scan.cluster_annotations()
anns = nods[0]

# Perform a consensus consolidation at the 50% agreement level.
# We pad the slices to add context for viewing.
cmask, cbbox, masks = consensus(anns,
                                clevel=0.5,
                                pad=[(20, 20), (20, 20), (0, 0)])

# Get the central slice of the computed bounding box.
k = int(0.5 * (cbbox[2].stop - cbbox[2].start))

# Set up the plot.
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax.imshow(vol[cbbox][:, :, k], cmap=plt.cm.gray, alpha=0.5)

# Plot the annotation contours for the kth slice.
colors = ['r', 'g', 'b', 'y']
for j in range(len(masks)):
    # Cycle through the palette: a cluster can hold more annotations than
    # there are colours, which previously raised IndexError.
    color = colors[j % len(colors)]
    for c in find_contours(masks[j][:, :, k].astype(float), 0.5):
        label = "Annotation %d" % (j + 1)
        plt.plot(c[:, 1], c[:, 0], color, label=label)
import SimpleITK as sitk import numpy as np i = 0 for scan in pl.query(pl.Scan): # annotation_groups is a list of of lists of Annotation's annotation_groups = scan.cluster_annotations() vol = scan.to_volume() # Next, for each annotation group, implement your criteria of what qualifies as GGO. E.g., for nodule_annotations in annotation_groups: # Only consider nodules with 4 annotators and have >= 50% indicating GGO if (len(nodule_annotations) >= 2 and sum([a.texture == 1 for a in nodule_annotations]) >= 1): consensus_mask, consensus_bbox, _ = consensus(nodule_annotations, clevel=0.5, pad=[(5, 5), (5, 5), (0, 0)]) image = np.asarray(vol[consensus_bbox][:, :, :]).transpose(2, 0, 1) mask_image = np.float32(np.array( consensus_mask[:, :, :])).transpose(2, 0, 1) img_sitk = sitk.GetImageFromArray(image) sitk.WriteImage( img_sitk, f"/home/anatielsantos/mestrado/bases/cortes-lidc/3d/image/ggo/nod{i}.nii" ) mask_sitk = sitk.GetImageFromArray(mask_image) sitk.WriteImage( mask_sitk,
def prepare_dataset(self):
    """Export per-slice image/mask `.npy` files and metadata for every patient.

    For each patient id in `self.IDRI_list`:

    * With annotated nodules: for every nodule, compute the consensus mask
      (`self.c_level`, `self.padding`) and, for each slice whose mask sum
      exceeds `self.mask_threshold`, save the lung-segmented image slice and
      the mask slice under the patient's image/mask directories.
    * Without nodules ("clean"): save the first 51 lung-segmented slices of
      the volume together with all-zero masks under the clean directories.

    One metadata row is recorded per saved slice via `self.save_meta`, and
    `self.meta` is written to `meta_info.csv` at the end.
    """
    # Zero-padded three-digit labels ("000".."999") used in file names.
    prefix = [str(x).zfill(3) for x in range(1000)]

    # Make the output directories.
    for directory in (self.img_path, self.mask_path, self.clean_path_img,
                      self.clean_path_mask, self.meta_path):
        os.makedirs(directory, exist_ok=True)

    IMAGE_DIR = Path(self.img_path)
    MASK_DIR = Path(self.mask_path)
    CLEAN_DIR_IMAGE = Path(self.clean_path_img)
    CLEAN_DIR_MASK = Path(self.clean_path_mask)

    for patient in tqdm(self.IDRI_list):
        pid = patient  # LIDC-IDRI-0001~
        scan = pl.query(pl.Scan).filter(pl.Scan.patient_id == pid).first()
        nodules_annotation = scan.cluster_annotations()
        vol = scan.to_volume()
        print(
            "Patient ID: {} Dicom Shape: {} Number of Annotated Nodules: {}"
            .format(pid, vol.shape, len(nodules_annotation)))

        patient_image_dir = IMAGE_DIR / pid
        patient_mask_dir = MASK_DIR / pid
        Path(patient_image_dir).mkdir(parents=True, exist_ok=True)
        Path(patient_mask_dir).mkdir(parents=True, exist_ok=True)

        if len(nodules_annotation) > 0:
            # Patients with nodules: iterate over each annotated nodule.
            for nodule_idx, nodule in enumerate(nodules_annotation):
                mask, cbbox, masks = consensus(nodule, self.c_level,
                                               self.padding)
                lung_np_array = vol[cbbox]

                # We calculate the malignancy information
                malignancy, cancer_label = self.calculate_malignancy(nodule)

                # Iterate over the slices of this single nodule.
                for nodule_slice in range(mask.shape[2]):
                    # Some mask slices are too small and may hinder training.
                    if np.sum(mask[:, :, nodule_slice]) <= self.mask_threshold:
                        continue

                    # Segment Lung part only
                    lung_segmented_np_array = segment_lung(
                        lung_np_array[:, :, nodule_slice])
                    # Rewrite every zero-valued entry (negative zero compares
                    # equal to zero) as plain 0; the original author observed
                    # -0 values causing dtype trouble downstream.
                    lung_segmented_np_array[lung_segmented_np_array == -0] = 0

                    # Naming of each file: NI= Nodule Image, MA= Mask Original
                    nodule_name = "{}_NI{}_slice{}".format(
                        pid[-4:], prefix[nodule_idx], prefix[nodule_slice])
                    mask_name = "{}_MA{}_slice{}".format(
                        pid[-4:], prefix[nodule_idx], prefix[nodule_slice])
                    meta_list = [
                        pid[-4:], nodule_idx, prefix[nodule_slice],
                        nodule_name, mask_name, malignancy, cancer_label,
                        False
                    ]

                    self.save_meta(meta_list)
                    np.save(patient_image_dir / nodule_name,
                            lung_segmented_np_array)
                    np.save(patient_mask_dir / mask_name,
                            mask[:, :, nodule_slice])
        else:
            print("Clean Dataset", pid)
            patient_clean_dir_image = CLEAN_DIR_IMAGE / pid
            patient_clean_dir_mask = CLEAN_DIR_MASK / pid
            Path(patient_clean_dir_image).mkdir(parents=True, exist_ok=True)
            Path(patient_clean_dir_mask).mkdir(parents=True, exist_ok=True)

            # There are patients that don't have a nodule at all, i.e. a clean
            # dataset. We need to use this for validation. Only slices 0..50
            # are exported.
            for slice_idx in range(min(vol.shape[2], 51)):
                lung_segmented_np_array = segment_lung(vol[:, :, slice_idx])
                lung_segmented_np_array[lung_segmented_np_array == -0] = 0
                lung_mask = np.zeros_like(lung_segmented_np_array)

                # CN= CleanNodule, CM = CleanMask
                nodule_name = "{}/{}_CN001_slice{}".format(
                    pid, pid[-4:], prefix[slice_idx])
                mask_name = "{}/{}_CM001_slice{}".format(
                    pid, pid[-4:], prefix[slice_idx])
                meta_list = [
                    pid[-4:], slice_idx, prefix[slice_idx], nodule_name,
                    mask_name, 0, False, True
                ]
                self.save_meta(meta_list)
                # Bug fix: the names above already start with "<pid>/", so
                # they are joined to the clean root directories. Joining them
                # to the per-patient directories pointed at
                # "<root>/<pid>/<pid>/...", which is never created.
                np.save(CLEAN_DIR_IMAGE / nodule_name,
                        lung_segmented_np_array)
                np.save(CLEAN_DIR_MASK / mask_name, lung_mask)

    print("Saved Meta data")
    # NOTE(review): plain string concatenation, so self.meta_path must end
    # with a path separator for the CSV to land inside the meta directory.
    self.meta.to_csv(self.meta_path + 'meta_info.csv', index=False)
# Choose the output sub-folder from the malignancy rating of the middle
# annotation (index len // 2) of nodule cluster `i`.
# NOTE(review): branches for other ratings are presumably handled before
# this fragment; confirm against the full script.
if (nods[i][int(len(nods[i]) / 2)].malignancy == 2):
    print("unlikely cancerous")
    savefile = savefile + r"\unlikely cancerous"
if (nods[i][int(len(nods[i]) / 2)].malignancy == 1):
    print("safe")
    savefile = savefile + r"\safe"
# Create the destination folder if it does not exist yet.
if os.path.isdir(savefile) == False:
    os.makedirs(savefile)
print(savefile)
# 50% consensus of the cluster, padded by 20 voxels on each side of the
# first two axes.
cmask, cbbox, masks = consensus(nods[i],
                                clevel=0.5,
                                pad=[(20, 20), (20, 20), (0, 0)])
# Central slice index along the third axis of the bounding box.
k = int(0.5 * (cbbox[2].stop - cbbox[2].start))
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax.imshow(vol[cbbox][:, :, k], cmap=plt.cm.gray, alpha=1.0)
# Plot the annotation contours for the kth slice.
# The plotting call below is commented out, so the loops only compute
# `label` and draw nothing.
colors = ['r', 'g', 'b', 'y']
for j in range(len(masks)):
    for c in find_contours(masks[j][:, :, k].astype(float), 0.5):
        label = "Annotation %d" % (j + 1)
        #plt.plot(c[:,1], c[:,0], colors[j], label=label)
# Hide the axes so only the image slice is shown.
ax.axis('off')
#ax.legend()
plt.tight_layout()
def preprocess_lidc(src: Path,
                    dest: Path,
                    sample: Union[Sequence[str], bool] = False,
                    nod_size: Tuple[int, int, int] = (100, 100, 60)):
    """Preprocesses the LIDC-IDRI dataset after being downloaded from TCIA.

    For every patient the scan volume, a whole-scan segmentation mask (union
    of all nodule consensus masks) and one cropped volume per nodule are
    saved as `.npy` files. Scan-level and nodule-level metadata are written
    to `scans.csv` and `nodules.csv`.

    Args:
        src (Path): Path to directory where the DICOM folders reside.
        dest (Path): Path to which volumes, masks and metadata should be
            written. Sub-directories `images`, `masks`, `nodules` and `meta`
            are created as needed.
        sample (Sequence[str] | bool): Patient ids to process instead of all
            ids found in `src`. Mainly used for testing. Defaults to False,
            meaning every patient is processed.
        nod_size (Tuple[int, int, int]): Size of extracted nodule volumes.
            Defaults to (100, 100, 60) pixels.
    """
    img_path = dest / "images"
    mask_path = dest / "masks"
    nod_path = dest / "nodules"
    meta_path = dest / "meta"
    for directory in (img_path, mask_path, nod_path, meta_path):
        directory.mkdir(parents=True, exist_ok=True)

    pids = get_pids(src)
    scan_data = []
    nod_data = []
    if sample:
        pids = sample

    for pid in tqdm(pids):
        scan = get_scan(pid)
        scan_data.append(get_scan_meta(scan))

        vol = scan.to_volume(verbose=False)
        np.save(img_path / f"{pid}.npy", vol.astype(np.int16))

        ann_clusters = scan.cluster_annotations(verbose=False)

        # Accumulate the union of the nodule masks in place rather than
        # keeping one full-volume mask per nodule in memory.
        scan_mask = np.zeros(vol.shape, dtype=bool)
        # pad whole image for segmentation mask
        pad_sz = int(np.max(vol.shape))

        for i, cluster in enumerate(ann_clusters):
            _, bbox = consensus(cluster, ret_masks=False)
            mask, _ = consensus(cluster, ret_masks=False, pad=pad_sz)

            # calc padding for nodule volume: split the size difference
            # between the target and the unpadded box across both sides.
            size_gap = np.array(nod_size) - np.array(vol[bbox].shape)
            nod_pad_sz = [(math.ceil(gap / 2), math.floor(gap / 2))
                          for gap in size_gap]
            _, pbbox = consensus(cluster, ret_masks=False, pad=nod_pad_sz)

            nod_vol = vol[pbbox]
            np.save(nod_path / f"{pid}_{i}.npy", nod_vol.astype(np.int16))

            nod_data.append(get_nod_meta(scan, cluster, i, bbox))
            np.logical_or(scan_mask, mask, out=scan_mask)

        np.save(mask_path / f"{pid}.npy", scan_mask.astype(np.uint8))

    scan_df = pd.DataFrame(data=scan_data)
    scan_df.to_csv(meta_path / "scans.csv", index=False)
    nod_df = pd.DataFrame(data=nod_data)
    nod_df.to_csv(meta_path / "nodules.csv", index=False)
def __prepare_nodule_list(self, cluster_list: List[List[pylidc.Annotation]]):
    """Return the list of LIDCNodule objects, building or loading the cache.

    The list is loaded from `self.nodule_list_pickle_path` when that file
    exists and `config_snapshot` reports a truthy result for the current
    configuration. Otherwise it is rebuilt from `cluster_list` and pickled.

    Args:
        cluster_list: annotation clusters, one inner list per nodule.

    Returns:
        List of LIDCNodule instances.
    """
    snapshot_settings = {
        "diam_interval": self.diam_interval,
        "extract_size_mm": self.extract_size_mm,
        "mask_dilation_iters": self.mask_dilation_iters,
    }
    pickle_found = os.path.exists(self.nodule_list_pickle_path)
    snapshot_ok = config_snapshot(
        "lidc_nodule", snapshot_settings,
        "./src/data/aux/.lidcnod_config_snapshot.json")

    # Fast path: reuse the cached list.
    if pickle_found and snapshot_ok:
        with open(self.nodule_list_pickle_path, "rb") as f:
            return pickle.load(f)

    nodule_list = []
    progress = tqdm(enumerate(cluster_list),
                    desc="Preparing LIDC nodule list",
                    total=len(cluster_list))
    for _, cluster in progress:
        # Check if all annotations belong to the same scan
        scan_ids = {ann.scan.id for ann in cluster}
        if len(scan_ids) != 1:
            logger.warning("annotations not from the same scans! skip")
            continue

        nodule_diam = np.mean([ann.diameter for ann in cluster])
        texture_scores = [ann.texture for ann in cluster]

        # Skip nodules out of diam interval and with ambiguous texture scores
        if (nodule_diam < self.diam_interval[0]
                or nodule_diam >= self.diam_interval[1]
                or not_valid_score(texture_scores)):
            continue

        # The padding is at least the largest bounding-box dimension among
        # the cluster's annotations.
        minsize = max(max(ann.bbox_dims(pad=None)) for ann in cluster)
        pad_mm = max(float(self.extract_size_mm), minsize)

        nodule_mask, nodule_bbox = consensus(cluster,
                                             clevel=0.8,
                                             pad=pad_mm,
                                             ret_masks=False)
        dilated_nodule_mask = binary_dilation(
            nodule_mask, iterations=self.mask_dilation_iters)
        nodule_coords = np.mean([ann.centroid for ann in cluster], axis=0)
        nodule_texture = mode(texture_scores).mode.item()

        nodule_list.append(
            LIDCNodule(
                pylidc_scan=cluster[0].scan,
                bbox=nodule_bbox,
                mask=dilated_nodule_mask,
                centroid=nodule_coords,
                diameter=nodule_diam,
                texture=nodule_texture,
            ))

    logger.info("pickling LIDC nodule list for future use")
    with open(self.nodule_list_pickle_path, "wb") as f:
        pickle.dump(nodule_list, f)
    return nodule_list
# Convert patients 0001..1010 to HDF5: one file per patient that has at least
# one labelled nodule voxel, holding an 'image' and a 'label' dataset.
for i in range(1, 1011):
    pid = pid_prefix + str(i).zfill(4)
    # get scan
    scan = pl.query(pl.Scan).filter(pl.Scan.patient_id == pid).first()
    # get image
    try:
        vol = scan.to_volume().astype(np.int16)
    except Exception:
        # Skip patients whose scan is missing (scan is None) or cannot be
        # loaded. `Exception` rather than a bare `except` so that Ctrl+C and
        # SystemExit still stop this long-running loop.
        continue
    print(vol.shape)
    print(np.max(vol), np.min(vol))
    # get nodule annotation
    nods = scan.cluster_annotations()
    label = np.zeros_like(vol, dtype=np.uint8)
    for nod in nods:
        cmask, cbbox, masks = consensus(nod, clevel=0.5)
        if np.sum(cmask) > 114:  # filter the nodules with a radius < 3mm
            # Merge instead of overwrite so that an overlapping bounding box
            # cannot erase voxels labelled by an earlier nodule.
            label[cbbox] = np.maximum(label[cbbox], cmask.astype(np.uint8))
    if np.sum(label) != 0:
        # Move the third axis to the front before saving.
        vol = np.transpose(vol, (2, 0, 1))
        label = np.transpose(label, (2, 0, 1))
        save_as_hdf5(vol, os.path.join(save_path, pid + '.hdf5'), 'image')
        save_as_hdf5(label, os.path.join(save_path, pid + '.hdf5'), 'label')
        assert list(np.unique(label)) == [0, 1]
        print('%s done !' % pid)