def get_train_holdout_files(model_type, holdout, train_percentage=80, frame_count=8):
    """Split the segmenter training images into a train list and a holdout list.

    Every ``*_1.png`` file under ``resources/segmenter_traindata/`` is paired
    with an overlay path obtained by replacing ``_1.png`` with ``_o.png``.
    Files whose name contains ``1.3.6.1.4``, ``spie`` or ``TIME`` always go to
    the train list. For all other files the text before the first underscore
    is used as patient id, and the file goes to the holdout list only when
    ``helpers.get_patient_fold(patient_id) % 3`` equals ``holdout``.

    ``model_type``, ``train_percentage`` and ``frame_count`` are accepted but
    not used by this function.

    Returns:
        ``(train_res, holdout_res)``: two lists of
        ``(image_path, overlay_path)`` tuples, in sorted path order.
    """
    print("Get train/holdout files.")
    always_train_markers = ("1.3.6.1.4", "spie", "TIME")
    train_res = []
    holdout_res = []
    for image_path in sorted(glob.glob("resources/segmenter_traindata/" + "*_1.png")):
        base_name = ntpath.basename(image_path)
        pair = (image_path, image_path.replace("_1.png", "_o.png"))
        if any(marker in base_name for marker in always_train_markers):
            train_res.append(pair)
            continue
        fold = helpers.get_patient_fold(base_name.split("_")[0])
        if fold % 3 != holdout:
            train_res.append(pair)
        else:
            holdout_res.append(pair)
    print("Train count: ", len(train_res), ", holdout count: ", len(holdout_res))
    return train_res, holdout_res
def predict_patients(patients_dir, model_path, holdout, patient_predictions, model_type):
    """Segment every patient's slices with the U-Net and record the predicted mass.

    For each sub-directory of ``patients_dir`` the matching slice images are
    resized to ``settings.SEGMENTER_IMG_SIZE``, pushed through the network,
    thresholded at 0.5 and summed into a per-patient ``mass``. A blended
    overlay of prediction and source slice is written next to every slice, and
    the accumulated predictions are written to
    ``settings.BASE_DIR + model_type + "_predictions.csv"`` after each patient.

    Args:
        patients_dir: Directory holding one sub-directory per patient. A
            trailing separator is no longer required.
        model_path: Weights file loaded into the network built by ``get_unet``.
        holdout: Fold to predict; a negative value disables fold filtering.
            Otherwise a patient whose fold (from ``helpers.get_patient_fold``
            with ``submission_set_neg=True``) is negative is processed only
            when ``holdout == 0``, and any other patient only when
            ``fold % 3 == holdout``.
        patient_predictions: List that receives ``(patient_id, mass)`` tuples;
            it is mutated in place.
        model_type: ``"masses"`` selects ``*_i.png`` slices, any other value
            ``*_c.png``. ``"emphysema"`` additionally restricts prediction to
            the second half of the sorted slice list.
    """
    model = get_unet(0.001)
    model.load_weights(model_path)
    img_size = settings.SEGMENTER_IMG_SIZE
    for item_name in os.listdir(patients_dir):
        # os.path.join works with or without a trailing separator on
        # patients_dir; plain concatenation silently matched nothing without it.
        patient_dir = os.path.join(patients_dir, item_name)
        if not os.path.isdir(patient_dir):
            continue
        patient_id = item_name

        if holdout >= 0:
            patient_fold = helpers.get_patient_fold(patient_id, submission_set_neg=True)
            if patient_fold < 0:
                if holdout != 0:
                    continue
            else:
                patient_fold %= 3
                if patient_fold != holdout:
                    continue

        print(patient_id)
        mass = 0
        img_type = "_i" if model_type == "masses" else "_c"
        # glob returns files in arbitrary order; sort so that "the second
        # half" below is the same set of slices on every platform.
        slices = sorted(glob.glob(os.path.join(patient_dir, "*" + img_type + ".png")))
        if model_type == "emphysema":
            slices = slices[len(slices) // 2:]

        for img_path in slices:
            src_img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            src_img = cv2.resize(src_img, dsize=(img_size, img_size))
            src_img = prepare_image_for_net(src_img)
            p = model.predict(src_img, batch_size=1)
            # Only confident pixels contribute to the mass.
            p[p < 0.5] = 0
            mass += p.sum()
            p = p[0, :, :, 0] * 255
            src_img = src_img.reshape((img_size, img_size))
            src_img *= 255
            # Blend 20% prediction over 80% source image for visual inspection.
            src_img = cv2.addWeighted(p.astype(numpy.uint8), 0.2, src_img.astype(numpy.uint8), 1 - 0.2, 0)
            cv2.imwrite(img_path.replace(img_type + ".png", "_" + model_type + "o.png"), src_img)

        if mass > 1:
            print(model_type + ": ", mass)
        patient_predictions.append((patient_id, mass))
        # Rewritten after every patient, so the CSV always reflects the
        # predictions gathered so far.
        df = pandas.DataFrame(patient_predictions, columns=["patient_id", "prediction"])
        df.to_csv(settings.BASE_DIR + model_type + "_predictions.csv", index=False)
def predict_patients(patients_dir, model_path, holdout, patient_predictions, model_type):
    """Predict a segmentation "mass" score for each patient directory.

    Loads the U-Net from ``get_unet`` with the weights at ``model_path``, then
    walks the sub-directories of ``patients_dir``. For every selected patient
    each slice is predicted, pixels below 0.5 are zeroed, and the remaining
    activations are summed into ``mass``. An overlay image is saved per slice
    and ``settings.BASE_DIR + model_type + "_predictions.csv"`` is rewritten
    after every patient.

    Args:
        patients_dir: Directory with one sub-directory per patient (with or
            without a trailing separator).
        model_path: Path of the weights file to load.
        holdout: Negative to process every patient. Otherwise patients with a
            negative fold are kept only for ``holdout == 0`` and all others
            only when ``fold % 3 == holdout``; the fold comes from
            ``helpers.get_patient_fold(..., submission_set_neg=True)``.
        patient_predictions: Output list, extended in place with
            ``(patient_id, mass)`` tuples.
        model_type: ``"masses"`` reads ``*_i.png`` slices, everything else
            reads ``*_c.png``. For ``"emphysema"`` only the second half of the
            sorted slices is predicted.
    """

    def in_requested_fold(patient_id):
        """Return True when the patient passes the holdout filter."""
        if holdout < 0:
            return True
        fold = helpers.get_patient_fold(patient_id, submission_set_neg=True)
        if fold < 0:
            return holdout == 0
        return fold % 3 == holdout

    model = get_unet(0.001)
    model.load_weights(model_path)
    side = settings.SEGMENTER_IMG_SIZE
    img_type = "_i" if model_type == "masses" else "_c"
    overlay_suffix = "_" + model_type + "o.png"

    for patient_id in os.listdir(patients_dir):
        # Joining instead of concatenating keeps this working when
        # patients_dir is passed without a trailing separator.
        patient_dir = os.path.join(patients_dir, patient_id)
        if not os.path.isdir(patient_dir):
            continue
        if not in_requested_fold(patient_id):
            continue

        print(patient_id)
        # Sorted because glob gives no ordering guarantee, and the emphysema
        # model deliberately looks at one half of the slice sequence.
        slices = sorted(glob.glob(os.path.join(patient_dir, "*" + img_type + ".png")))
        if model_type == "emphysema":
            slices = slices[len(slices) // 2:]

        mass = 0
        for img_path in slices:
            net_input = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            net_input = cv2.resize(net_input, dsize=(side, side))
            net_input = prepare_image_for_net(net_input)
            p = model.predict(net_input, batch_size=1)
            p[p < 0.5] = 0
            mass += p.sum()

            # Build the 20/80 prediction/source overlay for visual checking.
            prediction_img = p[0, :, :, 0] * 255
            source_img = net_input.reshape((side, side))
            source_img *= 255
            overlay = cv2.addWeighted(prediction_img.astype(numpy.uint8), 0.2, source_img.astype(numpy.uint8), 1 - 0.2, 0)
            cv2.imwrite(img_path.replace(img_type + ".png", overlay_suffix), overlay)

        if mass > 1:
            print(model_type + ": ", mass)
        patient_predictions.append((patient_id, mass))
        df = pandas.DataFrame(patient_predictions, columns=["patient_id", "prediction"])
        df.to_csv(settings.BASE_DIR + model_type + "_predictions.csv", index=False)
def get_train_holdout_files(model_type, holdout, train_percentage=80, frame_count=8):
    """Return the segmenter's ``(train, holdout)`` lists of image/overlay pairs.

    The images are the ``*_1.png`` files in ``resources/segmenter_traindata/``,
    processed in sorted order; each is paired with the path obtained by
    replacing ``_1.png`` with ``_o.png``. A file is a training sample when its
    name contains ``1.3.6.1.4``, ``spie`` or ``TIME``, or when the fold of its
    patient id (the part of the name before the first underscore), taken
    modulo 3, differs from ``holdout``. Everything else is a holdout sample.

    ``model_type``, ``train_percentage`` and ``frame_count`` are not used.
    """
    print("Get train/holdout files.")
    candidates = glob.glob("resources/segmenter_traindata/" + "*_1.png")
    candidates.sort()

    def belongs_to_train(name):
        """Decide from the bare file name whether the sample is a training one."""
        if "1.3.6.1.4" in name or "spie" in name or "TIME" in name:
            return True
        patient_id = name.split("_")[0]
        return helpers.get_patient_fold(patient_id) % 3 != holdout

    train_res, holdout_res = [], []
    for path in candidates:
        target = train_res if belongs_to_train(ntpath.basename(path)) else holdout_res
        target.append((path, path.replace("_1.png", "_o.png")))

    print("Train count: ", len(train_res), ", holdout count: ", len(holdout_res))
    return train_res, holdout_res
def get_train_holdout_files(fold_count, train_percentage=80, logreg=True, ndsb3_holdout=0, manual_labels=True, full_luna_set=False):
    """Build the interleaved train and holdout sample lists for the nodule detector.

    Positive cubes come from the LIDC and manually annotated LUNA16 folders
    and, when ``manual_labels`` is set, from the manually annotated NDSB3
    folder. Negative cubes come from the automatically generated LUNA16
    ``*_edge.png`` / ``*_luna.png`` files plus the ``*_falsepos.png`` files,
    which are added three times to the train set. In each result list every
    negative is emitted as ``(path, 0, 0)`` and, once every ``NEGS_PER_POS``
    negatives, one positive is inserted, cycling through the positives.

    Args:
        fold_count: Modulus applied to the NDSB3 patient fold. With a value
            of 3 the NDSB3 negative cubes are skipped.
        train_percentage: Percentage of the shuffled LUNA16 samples that goes
            to the train set.
        logreg: Not used by this function.
        ndsb3_holdout: NDSB3 fold that is routed to the holdout set.
        manual_labels: Include the manually annotated NDSB3 cubes.
        full_luna_set: Also train on the LUNA16 holdout samples.

    Returns:
        ``(train_res, holdout_res)``: lists of
        ``(path, class_label, size_label)`` tuples.
    """
    print("Get train/holdout files.")
    # LIDC positive samples.
    pos_samples = glob.glob(settings.BASE_DIR_SSD + "generated_traindata/luna16_train_cubes_lidc/*.png")
    print("Pos samples: ", len(pos_samples))

    # Manually annotated LUNA16 positive samples.
    pos_samples_manual = glob.glob(settings.BASE_DIR_SSD + "generated_traindata/luna16_train_cubes_manual/*_pos.png")
    print("Pos samples manual: ", len(pos_samples_manual))
    pos_samples += pos_samples_manual

    # Shuffle, then split into train / holdout by train_percentage.
    random.shuffle(pos_samples)
    train_pos_count = int((len(pos_samples) * train_percentage) / 100)
    pos_samples_train = pos_samples[:train_pos_count]
    pos_samples_holdout = pos_samples[train_pos_count:]
    if full_luna_set:
        pos_samples_train += pos_samples_holdout
        if manual_labels:
            # The LUNA16 holdout was just merged into the train set, so the
            # holdout is rebuilt from the NDSB3 manual samples only.
            pos_samples_holdout = []

    # Manually annotated NDSB3 samples.
    ndsb3_list = glob.glob(settings.BASE_DIR_SSD + "generated_traindata/ndsb3_train_cubes_manual/*.png")
    print("Ndsb3 samples: ", len(ndsb3_list))

    pos_samples_ndsb3_fold = []
    pos_samples_ndsb3_holdout = []
    ndsb3_pos = 0
    ndsb3_neg = 0
    ndsb3_pos_holdout = 0
    ndsb3_neg_holdout = 0
    if manual_labels:
        for file_path in ndsb3_list:
            file_name = ntpath.basename(file_path)
            # File name fields used here: [1] patient id, [3] "pos"/"neg",
            # [4] cancer label.
            parts = file_name.split("_")
            if int(parts[4]) == 0 and parts[3] != "neg":  # skip positive non-cancer-cases
                continue
            if fold_count == 3:
                if parts[3] == "neg":  # skip negative cases
                    continue

            patient_id = parts[1]
            patient_fold = helpers.get_patient_fold(patient_id) % fold_count
            if patient_fold == ndsb3_holdout:
                pos_samples_ndsb3_holdout.append(file_path)
                if parts[3] == "neg":
                    ndsb3_neg_holdout += 1
                else:
                    ndsb3_pos_holdout += 1
            else:
                pos_samples_ndsb3_fold.append(file_path)
                print("In fold: ", patient_id)
                if parts[3] == "neg":
                    ndsb3_neg += 1
                else:
                    ndsb3_pos += 1

    print(ndsb3_pos, " ndsb3 pos labels train")
    print(ndsb3_neg, " ndsb3 neg labels train")
    print(ndsb3_pos_holdout, " ndsb3 pos labels holdout")
    print(ndsb3_neg_holdout, " ndsb3 neg labels holdout")

    if manual_labels:
        for times_ndsb3 in range(4):  # make ndsb labels count 4 times just like in LIDC when 4 doctors annotated a nodule
            pos_samples_train += pos_samples_ndsb3_fold
            pos_samples_holdout += pos_samples_ndsb3_holdout

    neg_samples_edge = glob.glob(settings.BASE_DIR_SSD + "generated_traindata/luna16_train_cubes_auto/*_edge.png")
    print("Edge samples: ", len(neg_samples_edge))
    neg_samples_luna = glob.glob(settings.BASE_DIR_SSD + "generated_traindata/luna16_train_cubes_auto/*_luna.png")
    print("Luna samples: ", len(neg_samples_luna))

    neg_samples = neg_samples_edge + neg_samples_luna
    random.shuffle(neg_samples)
    train_neg_count = int((len(neg_samples) * train_percentage) / 100)

    neg_samples_falsepos = glob.glob(settings.BASE_DIR_SSD + "generated_traindata/luna16_train_cubes_auto/*_falsepos.png")
    print("Falsepos LUNA count: ", len(neg_samples_falsepos))

    neg_samples_train = neg_samples[:train_neg_count]
    # False positives are added three times to the train negatives.
    neg_samples_train += neg_samples_falsepos + neg_samples_falsepos + neg_samples_falsepos
    neg_samples_holdout = neg_samples[train_neg_count:]
    if full_luna_set:
        neg_samples_train += neg_samples_holdout

    train_res = []
    holdout_res = []
    sets = [(train_res, pos_samples_train, neg_samples_train),
            (holdout_res, pos_samples_holdout, neg_samples_holdout)]
    for set_item in sets:
        pos_idx = 0
        negs_per_pos = NEGS_PER_POS
        res = set_item[0]
        neg_samples = set_item[2]
        pos_samples = set_item[1]
        print("Pos", len(pos_samples))
        ndsb3_pos = 0
        ndsb3_neg = 0
        for index, neg_sample_path in enumerate(neg_samples):
            res.append((neg_sample_path, 0, 0))
            # With no positives at all (e.g. an empty holdout fold) there is
            # nothing to interleave; indexing or taking the modulus of an
            # empty list would raise.
            if not pos_samples or index % negs_per_pos != 0:
                continue

            pos_sample_path = pos_samples[pos_idx]
            file_name = ntpath.basename(pos_sample_path)
            parts = file_name.split("_")
            if parts[0].startswith("ndsb3manual"):
                if parts[3] == "pos":
                    class_label = 1  # only take positive examples where we know there was a cancer..
                    cancer_label = int(parts[4])
                    assert cancer_label == 1
                    size_label = int(parts[5])
                    if size_label < 1:
                        print("huh ?")
                    assert size_label >= 1
                    ndsb3_pos += 1
                else:
                    class_label = 0
                    size_label = 0
                    ndsb3_neg += 1
            else:
                # LUNA16 positives: labels are the fields before the
                # trailing "pos.png".
                class_label = int(parts[-2])
                size_label = int(parts[-3])
                assert class_label == 1
                assert parts[-1] == "pos.png"
                assert size_label >= 1

            res.append((pos_sample_path, class_label, size_label))
            pos_idx += 1
            pos_idx %= len(pos_samples)

        print("ndsb2 pos: ", ndsb3_pos)
        print("ndsb2 neg: ", ndsb3_neg)

    print("Train count: ", len(train_res), ", holdout count: ", len(holdout_res))
    return train_res, holdout_res
def get_train_holdout_files(fold_count, train_percentage=80, logreg=True, ndsb3_holdout=0, manual_labels=True, full_luna_set=False, local_patient_set=False):
    """Build the interleaved train and holdout sample lists for the nodule detector.

    Positive cubes are read from the LIDC and manually annotated LUNA16
    folders under ``settings.WORKING_DIR`` and, with ``manual_labels``, from
    the manually annotated NDSB3 folder. Negative cubes are the LUNA16
    ``*_edge.png`` / ``*_luna.png`` files plus the ``*_falsepos.png`` files
    (added three times to the train set). Each result list contains every
    negative as ``(path, 0, 0)`` with one positive inserted every
    ``NEGS_PER_POS`` negatives, cycling through the positives. With
    ``local_patient_set`` the hospital cubes are appended afterwards.

    Args:
        fold_count: Modulus applied to the NDSB3 patient fold. With a value
            of 3 the NDSB3 negative cubes are skipped.
        train_percentage: Percentage of the shuffled samples used for training.
        logreg: Not used by this function.
        ndsb3_holdout: NDSB3 fold that is routed to the holdout set.
        manual_labels: Include the manually annotated NDSB3 cubes.
        full_luna_set: Also train on the LUNA16 holdout samples.
        local_patient_set: Include the hospital cubes, split by
            ``train_percentage``.

    Returns:
        ``(train_res, holdout_res)``: lists of
        ``(path, class_label, size_label)`` tuples.
    """
    logger.info("Get train/holdout files.")
    pos_samples = glob.glob(settings.WORKING_DIR + "generated_traindata/luna16_train_cubes_lidc/*.png")
    logger.info("Pos samples: {0}".format(len(pos_samples)))

    pos_samples_manual = glob.glob(settings.WORKING_DIR + "generated_traindata/luna16_train_cubes_manual/*_pos.png")
    logger.info("Pos samples manual: {0}".format(len(pos_samples_manual)))
    pos_samples += pos_samples_manual

    random.shuffle(pos_samples)
    train_pos_count = int((len(pos_samples) * train_percentage) / 100)
    pos_samples_train = pos_samples[:train_pos_count]
    pos_samples_holdout = pos_samples[train_pos_count:]
    if full_luna_set:
        pos_samples_train += pos_samples_holdout
        if manual_labels:
            # The LUNA16 holdout was just merged into the train set, so the
            # holdout is rebuilt from the NDSB3 manual samples only.
            pos_samples_holdout = []

    ndsb3_list = glob.glob(settings.WORKING_DIR + "generated_traindata/ndsb3_train_cubes_manual/*.png")
    logger.info("Ndsb3 samples: {0} ".format(len(ndsb3_list)))

    pos_samples_ndsb3_fold = []
    pos_samples_ndsb3_holdout = []
    ndsb3_pos = 0
    ndsb3_neg = 0
    ndsb3_pos_holdout = 0
    ndsb3_neg_holdout = 0
    if manual_labels:
        for file_path in ndsb3_list:
            file_name = ntpath.basename(file_path)
            # The split was missing here, so ``parts`` was read before it was
            # ever assigned. Fields used: [1] patient id, [3] "pos"/"neg",
            # [4] cancer label.
            parts = file_name.split("_")
            if int(parts[4]) == 0 and parts[3] != "neg":  # skip positive non-cancer-cases
                continue
            if fold_count == 3:
                if parts[3] == "neg":  # skip negative cases
                    continue

            patient_id = parts[1]
            patient_fold = helpers.get_patient_fold(patient_id) % fold_count
            if patient_fold == ndsb3_holdout:
                logger.info("In holdout: {0}".format(patient_id))
                pos_samples_ndsb3_holdout.append(file_path)
                if parts[3] == "neg":
                    ndsb3_neg_holdout += 1
                else:
                    ndsb3_pos_holdout += 1
            else:
                pos_samples_ndsb3_fold.append(file_path)
                logger.info("In fold: {0}".format(patient_id))
                if parts[3] == "neg":
                    ndsb3_neg += 1
                else:
                    ndsb3_pos += 1

    logger.info("{0} ndsb3 pos labels train".format(ndsb3_pos))
    logger.info("{0} ndsb3 neg labels train".format(ndsb3_neg))
    logger.info("{0} ndsb3 pos labels holdout".format(ndsb3_pos_holdout))
    logger.info("{0} ndsb3 neg labels holdout".format(ndsb3_neg_holdout))

    pos_samples_hospital_train = []
    pos_samples_hospital_holdout = []
    if local_patient_set:
        logger.info("Including hospital cases...")
        hospital_list = glob.glob(settings.WORKING_DIR + "generated_traindata/hospital_train_cubes_manual/*.png")
        random.shuffle(hospital_list)
        train_hospital_count = int((len(hospital_list) * train_percentage) / 100)
        pos_samples_hospital_train = hospital_list[:train_hospital_count]
        pos_samples_hospital_holdout = hospital_list[train_hospital_count:]

    if manual_labels:
        for times_ndsb3 in range(4):  # make ndsb labels count 4 times just like in LIDC when 4 doctors annotated a nodule
            pos_samples_train += pos_samples_ndsb3_fold
            pos_samples_holdout += pos_samples_ndsb3_holdout

    neg_samples_edge = glob.glob(settings.WORKING_DIR + "generated_traindata/luna16_train_cubes_auto/*_edge.png")
    logger.info("Edge samples: {0}".format(len(neg_samples_edge)))
    neg_samples_luna = glob.glob(settings.WORKING_DIR + "generated_traindata/luna16_train_cubes_auto/*_luna.png")
    logger.info("Luna samples: {0}".format(len(neg_samples_luna)))

    neg_samples = neg_samples_edge + neg_samples_luna
    random.shuffle(neg_samples)
    train_neg_count = int((len(neg_samples) * train_percentage) / 100)

    neg_samples_falsepos = glob.glob(settings.WORKING_DIR + "generated_traindata/luna16_train_cubes_auto/*_falsepos.png")
    logger.info("Falsepos LUNA count: {0}".format(len(neg_samples_falsepos)))

    neg_samples_train = neg_samples[:train_neg_count]
    # False positives are added three times to the train negatives.
    neg_samples_train += neg_samples_falsepos + neg_samples_falsepos + neg_samples_falsepos
    neg_samples_holdout = neg_samples[train_neg_count:]
    if full_luna_set:
        neg_samples_train += neg_samples_holdout

    train_res = []
    holdout_res = []
    logger.info("Train positive samples: {0}".format(len(pos_samples_train)))
    logger.info("Train negative samples: {0}".format(len(neg_samples_train)))
    logger.info("Train hospital samples: {0}".format(len(pos_samples_hospital_train)))
    logger.info("Holdout positive samples: {0}".format(len(pos_samples_holdout)))
    logger.info("Holdout negative samples: {0}".format(len(neg_samples_holdout)))
    logger.info("Holdout hospital samples: {0}".format(len(pos_samples_hospital_holdout)))

    sets = [(train_res, pos_samples_train, neg_samples_train, pos_samples_hospital_train),
            (holdout_res, pos_samples_holdout, neg_samples_holdout, pos_samples_hospital_holdout)]
    for set_item in sets:
        pos_idx = 0
        negs_per_pos = NEGS_PER_POS
        res = set_item[0]
        neg_samples = set_item[2]
        pos_samples = set_item[1]
        hospital_samples = set_item[3]
        logger.info("Pos: {0}".format(len(pos_samples)))
        ndsb3_pos = 0
        ndsb3_neg = 0
        for index, neg_sample_path in enumerate(neg_samples):
            res.append((neg_sample_path, 0, 0))
            # With no positives at all (e.g. an empty holdout fold) there is
            # nothing to interleave; indexing or taking the modulus of an
            # empty list would raise.
            if not pos_samples or index % negs_per_pos != 0:
                continue

            pos_sample_path = pos_samples[pos_idx]
            file_name = ntpath.basename(pos_sample_path)
            parts = file_name.split("_")
            if parts[0].startswith("ndsb3manual"):
                if parts[3] == "pos":
                    class_label = 1  # only take positive examples where we know there was a cancer..
                    cancer_label = int(parts[4])
                    assert cancer_label == 1
                    size_label = int(parts[5])
                    if size_label < 1:
                        logger.info("huh ?")
                    assert size_label >= 1
                    ndsb3_pos += 1
                else:
                    class_label = 0
                    size_label = 0
                    ndsb3_neg += 1
            else:
                # LUNA16 positives: labels are the fields before the
                # trailing "pos.png".
                class_label = int(parts[-2])
                size_label = int(parts[-3])
                assert class_label == 1
                assert parts[-1] == "pos.png"
                assert size_label >= 1

            res.append((pos_sample_path, class_label, size_label))
            pos_idx += 1
            pos_idx %= len(pos_samples)

        if local_patient_set:
            # Hospital cubes are appended as-is, not interleaved.
            for hospital_sample_path in hospital_samples:
                file_name = os.path.basename(hospital_sample_path)
                parts = file_name.split("_")
                if parts[3] == "pos":
                    class_label = 1
                else:
                    class_label = 0
                size_label = int(parts[5])
                if size_label < 1:
                    logger.info("{0} nodule size < 1".format(file_name))
                logger.info("Add sample {0} class: {1} size: {2}".format(hospital_sample_path, class_label, size_label))
                res.append((hospital_sample_path, class_label, size_label))

        logger.info("ndsb2 pos: {0}".format(ndsb3_pos))
        logger.info("ndsb2 neg: {0}".format(ndsb3_neg))

    logger.info("Train count: {0}, holdout count: {1} ".format(len(train_res), len(holdout_res)))
    return train_res, holdout_res
def predict_cubes(model_path, continue_job, only_patient_id=None, luna16=False, magnification=1, flip=False, train_data=True, holdout_no=-1, ext_name="", fold_count=2):
    """Slide a cube over every patient volume and write nodule candidates to CSV.

    For each patient directory in ``settings.NDSB3_EXTRACTED_IMAGE_DIR`` the
    image and mask volumes are loaded (and rescaled when ``magnification`` is
    not 1), cubes of ``CUBE_SIZE`` are taken every ``PREDICT_STEP`` voxels,
    and cubes whose mask sums to at least 2000 are predicted in batches of
    128. Every cube whose nodule chance exceeds ``P_TH`` becomes one CSV row.
    The rows are passed to ``filter_patient_nodules_predictions`` and written
    to ``<dst_dir>/<patient_id>.csv``.

    Unlike the previous implementation, the last, partially filled batch of a
    patient is predicted as well instead of being silently dropped.

    Args:
        model_path: Weights file handed to ``step2_train_nodule_detector.get_net``.
        continue_job: Skip patients whose CSV already exists (only when
            ``only_patient_id`` is None).
        only_patient_id: When given, process just this patient.
        luna16: Write below ``settings.LUNA_NODULE_DETECTION_DIR`` and skip
            the labels-file membership check.
        magnification: Rescale factor applied to image and mask volumes.
        flip: Mirror every cube along its last axis before prediction.
        train_data: Use ``resources/stage1_labels.csv`` for the membership
            check (otherwise ``resources/tc_sample_submission.csv``) and
            enable fold filtering.
        holdout_no: With ``train_data``, only patients whose fold modulo
            ``fold_count`` equals this value are processed.
        ext_name: Suffix of the prediction directory name.
        fold_count: Modulus applied to the patient fold.
    """
    batch_size = 128

    if luna16:
        dst_dir = settings.LUNA_NODULE_DETECTION_DIR
    else:
        dst_dir = settings.NDSB3_NODULE_DETECTION_DIR
    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)

    holdout_ext = ""
    flip_ext = "_flip" if flip else ""
    dst_dir += "predictions" + str(int(magnification * 10)) + holdout_ext + flip_ext + "_" + ext_name + "/"
    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)

    sw = helpers.Stopwatch.start_new()
    model = step2_train_nodule_detector.get_net(input_shape=(CUBE_SIZE, CUBE_SIZE, CUBE_SIZE, 1), load_weight_path=model_path)
    if not luna16:
        if train_data:
            labels_df = pandas.read_csv("resources/stage1_labels.csv")
        else:
            labels_df = pandas.read_csv("resources/tc_sample_submission.csv")
        labels_df.set_index(["id"], inplace=True)

    def predict_batch(cubes, coords, patient_img, predict_volume, first_index, crop_size, step):
        """Predict one batch and return the CSV rows of cubes above ``P_TH``."""
        rows = []
        p = model.predict(numpy.vstack(cubes), batch_size=batch_size)
        # p[0] holds the nodule chance per cube, p[1] the diameter in mm.
        for (p_z, p_y, p_x), chance_row, diameter_row in zip(coords, p[0], p[1]):
            nodule_chance = chance_row[0]
            predict_volume[p_z, p_y, p_x] = nodule_chance
            if nodule_chance > P_TH:
                # Convert grid indices to the voxel position of the cube centre.
                p_z = p_z * step + crop_size / 2
                p_y = p_y * step + crop_size / 2
                p_x = p_x * step + crop_size / 2
                p_z_perc = round(p_z / patient_img.shape[0], 4)
                p_y_perc = round(p_y / patient_img.shape[1], 4)
                p_x_perc = round(p_x / patient_img.shape[2], 4)
                diameter_mm = round(diameter_row[0], 4)
                diameter_perc = round(diameter_mm / patient_img.shape[2], 4)
                nodule_chance = round(nodule_chance, 4)
                rows.append([first_index + len(rows), p_x_perc, p_y_perc, p_z_perc, diameter_perc, nodule_chance, diameter_mm, p_x, p_y, p_z])
        return rows

    patient_ids = []
    for file_name in os.listdir(settings.NDSB3_EXTRACTED_IMAGE_DIR):
        if not os.path.isdir(settings.NDSB3_EXTRACTED_IMAGE_DIR + file_name):
            continue
        patient_ids.append(file_name)

    all_predictions_csv = []
    for patient_index, patient_id in enumerate(reversed(patient_ids)):
        if not luna16:
            if patient_id not in labels_df.index:
                continue
        if "metadata" in patient_id:
            continue
        if only_patient_id is not None and only_patient_id != patient_id:
            continue

        if holdout_no is not None and train_data:
            patient_fold = helpers.get_patient_fold(patient_id)
            patient_fold %= fold_count
            if patient_fold != holdout_no:
                continue

        print(patient_index, ": ", patient_id)
        csv_target_path = dst_dir + patient_id + ".csv"
        if continue_job and only_patient_id is None:
            if os.path.exists(csv_target_path):
                continue

        patient_img = helpers.load_patient_images(patient_id, settings.NDSB3_EXTRACTED_IMAGE_DIR, "*_i.png", [])
        if magnification != 1:
            patient_img = helpers.rescale_patient_images(patient_img, (1, 1, 1), magnification)

        patient_mask = helpers.load_patient_images(patient_id, settings.NDSB3_EXTRACTED_IMAGE_DIR, "*_m.png", [])
        if magnification != 1:
            patient_mask = helpers.rescale_patient_images(patient_mask, (1, 1, 1), magnification, is_mask_image=True)

        step = PREDICT_STEP
        CROP_SIZE = CUBE_SIZE

        # Number of cube positions per axis: positions start every `step`
        # voxels while the cube still fits strictly inside the volume.
        predict_volume_shape_list = [0, 0, 0]
        for dim in range(3):
            dim_indent = 0
            while dim_indent + CROP_SIZE < patient_img.shape[dim]:
                predict_volume_shape_list[dim] += 1
                dim_indent += step

        predict_volume_shape = (predict_volume_shape_list[0], predict_volume_shape_list[1], predict_volume_shape_list[2])
        predict_volume = numpy.zeros(shape=predict_volume_shape, dtype=float)
        print("Predict volume shape: ", predict_volume.shape)

        done_count = 0
        skipped_count = 0
        batch_list = []
        batch_list_coords = []
        patient_predictions_csv = []
        annotation_index = 0

        def flush_batch():
            """Predict the pending cubes and append their rows to the results."""
            rows = predict_batch(batch_list, batch_list_coords, patient_img, predict_volume, annotation_index, CROP_SIZE, step)
            patient_predictions_csv.extend(rows)
            all_predictions_csv.extend([patient_id] + row for row in rows)
            return len(rows)

        for z in range(0, predict_volume_shape[0]):
            for y in range(0, predict_volume_shape[1]):
                for x in range(0, predict_volume_shape[2]):
                    cube_img = patient_img[z * step:z * step + CROP_SIZE, y * step:y * step + CROP_SIZE, x * step:x * step + CROP_SIZE]
                    cube_mask = patient_mask[z * step:z * step + CROP_SIZE, y * step:y * step + CROP_SIZE, x * step:x * step + CROP_SIZE]

                    if cube_mask.sum() < 2000:
                        # Too little mask inside the cube; not worth predicting.
                        skipped_count += 1
                    else:
                        if flip:
                            cube_img = cube_img[:, :, ::-1]
                        if CROP_SIZE != CUBE_SIZE:
                            cube_img = helpers.rescale_patient_images2(cube_img, (CUBE_SIZE, CUBE_SIZE, CUBE_SIZE))

                        img_prep = prepare_image_for_net3D(cube_img)
                        batch_list.append(img_prep)
                        batch_list_coords.append((z, y, x))
                        if len(batch_list) == batch_size:
                            annotation_index += flush_batch()
                            batch_list = []
                            batch_list_coords = []

                    done_count += 1
                    if done_count % 10000 == 0:
                        print("Done: ", done_count, " skipped:", skipped_count)

        # Predict whatever is left in the final, partially filled batch;
        # without this the last (< batch_size) cubes never got a prediction.
        if batch_list:
            annotation_index += flush_batch()
            batch_list = []
            batch_list_coords = []

        df = pandas.DataFrame(patient_predictions_csv, columns=["anno_index", "coord_x", "coord_y", "coord_z", "diameter", "nodule_chance", "diameter_mm", "abs_x", "abs_y", "abs_z"])
        filter_patient_nodules_predictions(df, patient_id, CROP_SIZE * magnification)
        df.to_csv(csv_target_path, index=False)

        print(predict_volume.mean())
        print("Done in : ", sw.get_elapsed_seconds(), " seconds")
def get_train_holdout_files(fold_count, train_percentage=80, logreg=True, ndsb3_holdout=0, manual_labels=True, full_luna_set=False):
    """Assemble the train and holdout sample lists for the nodule detector.

    Positives are the LIDC and manually annotated LUNA16 cubes plus, with
    ``manual_labels``, the manually annotated NDSB3 cubes (added four times).
    Negatives are the automatically generated LUNA16 ``*_edge.png`` and
    ``*_luna.png`` cubes plus the ``*_falsepos.png`` cubes (added three times
    to the train set). Both result lists contain every negative as
    ``(path, 0, 0)``, with one positive inserted every ``NEGS_PER_POS``
    negatives while cycling through the positives. A set without positives
    now yields its negatives only instead of raising.

    Args:
        fold_count: Modulus applied to the NDSB3 patient fold. With a value
            of 3 the NDSB3 negative cubes are skipped.
        train_percentage: Percentage of the shuffled LUNA16 samples used for
            training.
        logreg: Not used by this function.
        ndsb3_holdout: NDSB3 fold that is routed to the holdout set.
        manual_labels: Include the manually annotated NDSB3 cubes.
        full_luna_set: Also train on the LUNA16 holdout samples.

    Returns:
        ``(train_res, holdout_res)``: lists of
        ``(path, class_label, size_label)`` tuples.
    """
    data_dir = settings.BASE_DIR_SSD + "generated_traindata/"

    def positive_labels(sample_path):
        """Parse ``(class_label, size_label, ndsb3_kind)`` from a positive's file name.

        ``ndsb3_kind`` is ``"pos"`` / ``"neg"`` for NDSB3 manual cubes and
        ``None`` for LUNA16 cubes.
        """
        parts = ntpath.basename(sample_path).split("_")
        if not parts[0].startswith("ndsb3manual"):
            # LUNA16 positives: labels are the fields before "pos.png".
            class_label = int(parts[-2])
            size_label = int(parts[-3])
            assert class_label == 1
            assert parts[-1] == "pos.png"
            assert size_label >= 1
            return class_label, size_label, None
        if parts[3] != "pos":
            return 0, 0, "neg"
        # only take positive examples where we know there was a cancer..
        cancer_label = int(parts[4])
        assert cancer_label == 1
        size_label = int(parts[5])
        if size_label < 1:
            print("huh ?")
        assert size_label >= 1
        return 1, size_label, "pos"

    print("Get train/holdout files.")
    pos_samples = glob.glob(data_dir + "luna16_train_cubes_lidc/*.png")
    print("Pos samples: ", len(pos_samples))
    pos_samples_manual = glob.glob(data_dir + "luna16_train_cubes_manual/*_pos.png")
    print("Pos samples manual: ", len(pos_samples_manual))
    pos_samples += pos_samples_manual

    random.shuffle(pos_samples)
    train_pos_count = int((len(pos_samples) * train_percentage) / 100)
    pos_samples_train = pos_samples[:train_pos_count]
    pos_samples_holdout = pos_samples[train_pos_count:]
    if full_luna_set:
        pos_samples_train += pos_samples_holdout
        if manual_labels:
            # The LUNA16 holdout was just merged into the train set, so the
            # holdout is rebuilt from the NDSB3 manual samples only.
            pos_samples_holdout = []

    ndsb3_list = glob.glob(data_dir + "ndsb3_train_cubes_manual/*.png")
    print("Ndsb3 samples: ", len(ndsb3_list))

    pos_samples_ndsb3_fold = []
    pos_samples_ndsb3_holdout = []
    ndsb3_pos = 0
    ndsb3_neg = 0
    ndsb3_pos_holdout = 0
    ndsb3_neg_holdout = 0
    if manual_labels:
        for file_path in ndsb3_list:
            # File name fields used here: [1] patient id, [3] "pos"/"neg",
            # [4] cancer label.
            parts = ntpath.basename(file_path).split("_")
            is_negative = parts[3] == "neg"
            if int(parts[4]) == 0 and not is_negative:  # skip positive non-cancer-cases
                continue
            if fold_count == 3 and is_negative:  # skip negative cases
                continue

            patient_id = parts[1]
            patient_fold = helpers.get_patient_fold(patient_id) % fold_count
            if patient_fold == ndsb3_holdout:
                pos_samples_ndsb3_holdout.append(file_path)
                if is_negative:
                    ndsb3_neg_holdout += 1
                else:
                    ndsb3_pos_holdout += 1
            else:
                pos_samples_ndsb3_fold.append(file_path)
                print("In fold: ", patient_id)
                if is_negative:
                    ndsb3_neg += 1
                else:
                    ndsb3_pos += 1

    print(ndsb3_pos, " ndsb3 pos labels train")
    print(ndsb3_neg, " ndsb3 neg labels train")
    print(ndsb3_pos_holdout, " ndsb3 pos labels holdout")
    print(ndsb3_neg_holdout, " ndsb3 neg labels holdout")

    if manual_labels:
        for times_ndsb3 in range(4):  # make ndsb labels count 4 times just like in LIDC when 4 doctors annotated a nodule
            pos_samples_train += pos_samples_ndsb3_fold
            pos_samples_holdout += pos_samples_ndsb3_holdout

    neg_samples_edge = glob.glob(data_dir + "luna16_train_cubes_auto/*_edge.png")
    print("Edge samples: ", len(neg_samples_edge))
    neg_samples_luna = glob.glob(data_dir + "luna16_train_cubes_auto/*_luna.png")
    print("Luna samples: ", len(neg_samples_luna))

    neg_samples = neg_samples_edge + neg_samples_luna
    random.shuffle(neg_samples)
    train_neg_count = int((len(neg_samples) * train_percentage) / 100)

    neg_samples_falsepos = glob.glob(data_dir + "luna16_train_cubes_auto/*_falsepos.png")
    print("Falsepos LUNA count: ", len(neg_samples_falsepos))

    neg_samples_train = neg_samples[:train_neg_count]
    # False positives are added three times to the train negatives.
    neg_samples_train += neg_samples_falsepos + neg_samples_falsepos + neg_samples_falsepos
    neg_samples_holdout = neg_samples[train_neg_count:]
    if full_luna_set:
        neg_samples_train += neg_samples_holdout

    train_res = []
    holdout_res = []
    sets = [(train_res, pos_samples_train, neg_samples_train),
            (holdout_res, pos_samples_holdout, neg_samples_holdout)]
    for res, set_pos_samples, set_neg_samples in sets:
        pos_idx = 0
        negs_per_pos = NEGS_PER_POS
        print("Pos", len(set_pos_samples))
        ndsb3_pos = 0
        ndsb3_neg = 0
        for index, neg_sample_path in enumerate(set_neg_samples):
            res.append((neg_sample_path, 0, 0))
            # Nothing to interleave when the set has no positives; indexing
            # or taking the modulus of an empty list would raise.
            if not set_pos_samples or index % negs_per_pos != 0:
                continue

            pos_sample_path = set_pos_samples[pos_idx]
            class_label, size_label, ndsb3_kind = positive_labels(pos_sample_path)
            if ndsb3_kind == "pos":
                ndsb3_pos += 1
            elif ndsb3_kind == "neg":
                ndsb3_neg += 1

            res.append((pos_sample_path, class_label, size_label))
            pos_idx = (pos_idx + 1) % len(set_pos_samples)

        print("ndsb2 pos: ", ndsb3_pos)
        print("ndsb2 neg: ", ndsb3_neg)

    print("Train count: ", len(train_res), ", holdout count: ", len(holdout_res))
    return train_res, holdout_res
def predict_cubes(path, model_path, magnification=1, holdout_no=-1, ext_name="", fold_count=2):
    """Predict nodule candidates for one preprocessed patient and write them to CSV.

    The image and mask volumes are loaded from ``path + "_Preprocessed"``
    (and rescaled when ``magnification`` is not 1). Cubes of ``CUBE_SIZE``
    are taken every ``PREDICT_STEP`` voxels, and cubes whose mask sums to at
    least 2000 are predicted in batches of 128. Each cube whose nodule chance
    exceeds ``P_TH`` becomes one CSV row. The rows are passed to
    ``filter_patient_nodules_predictions`` and written to
    ``<LUNA_NODULE_DETECTION_DIR>/predictions<mag*10>_<ext_name>/<path>.csv``.

    Changes compared with the previous implementation:
      * a failure while loading the images is reported and re-raised, instead
        of being swallowed and resurfacing later as an unbound variable;
      * cubes counted as skipped are not predicted, matching the other
        ``predict_cubes`` implementations;
      * the last, partially filled batch is predicted too;
      * the patient fold is no longer computed, because its value was never
        used.

    Args:
        path: Patient id; also the prefix of the ``_Preprocessed`` image
            directory and the CSV file name.
        model_path: Weights file handed to ``step2_train_nodule_detector.get_net``.
        magnification: Rescale factor applied to image and mask volumes.
        holdout_no: Accepted for interface compatibility; has no effect.
        ext_name: Suffix of the prediction directory name.
        fold_count: Accepted for interface compatibility; has no effect.
    """
    batch_size = 128

    dst_dir = settings.LUNA_NODULE_DETECTION_DIR
    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)

    holdout_ext = ""
    dst_dir += "predictions" + str(int(magnification * 10)) + holdout_ext + "_" + ext_name + "/"
    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)

    sw = helpers.Stopwatch.start_new()
    model = step2_train_nodule_detector.get_net(input_shape=(CUBE_SIZE, CUBE_SIZE, CUBE_SIZE, 1), load_weight_path=model_path)
    patient_id = path
    all_predictions_csv = []

    print(": ", patient_id)
    csv_target_path = dst_dir + patient_id + ".csv"
    print(patient_id)

    try:
        patient_img = helpers.load_patient_images(patient_id + '_Preprocessed', '', "*_i.png", [])
    except Exception:
        # Keep the user-facing hint, but let the real error propagate:
        # nothing below can work without the image volume.
        print('Please Re-Process the dicom file again')
        raise
    if magnification != 1:
        patient_img = helpers.rescale_patient_images(patient_img, (1, 1, 1), magnification)

    patient_mask = helpers.load_patient_images(patient_id + '_Preprocessed', '', "*_m.png", [])
    if magnification != 1:
        patient_mask = helpers.rescale_patient_images(patient_mask, (1, 1, 1), magnification, is_mask_image=True)

    step = PREDICT_STEP
    CROP_SIZE = CUBE_SIZE

    # Number of cube positions per axis: positions start every `step` voxels
    # while the cube still fits strictly inside the volume.
    predict_volume_shape_list = [0, 0, 0]
    for dim in range(3):
        dim_indent = 0
        while dim_indent + CROP_SIZE < patient_img.shape[dim]:
            predict_volume_shape_list[dim] += 1
            dim_indent += step

    predict_volume_shape = (predict_volume_shape_list[0], predict_volume_shape_list[1], predict_volume_shape_list[2])
    predict_volume = numpy.zeros(shape=predict_volume_shape, dtype=float)
    print("Predict volume shape: ", predict_volume.shape)

    done_count = 0
    skipped_count = 0
    batch_list = []
    batch_list_coords = []
    patient_predictions_csv = []
    annotation_index = 0

    def predict_batch(cubes, coords, first_index):
        """Predict one batch and return the CSV rows of cubes above ``P_TH``."""
        rows = []
        p = model.predict(numpy.vstack(cubes), batch_size=batch_size)
        # p[0] holds the nodule chance per cube, p[1] the diameter in mm.
        for (p_z, p_y, p_x), chance_row, diameter_row in zip(coords, p[0], p[1]):
            nodule_chance = chance_row[0]
            predict_volume[p_z, p_y, p_x] = nodule_chance
            if nodule_chance > P_TH:
                # Convert grid indices to the voxel position of the cube centre.
                p_z = p_z * step + CROP_SIZE / 2
                p_y = p_y * step + CROP_SIZE / 2
                p_x = p_x * step + CROP_SIZE / 2
                p_z_perc = round(p_z / patient_img.shape[0], 4)
                p_y_perc = round(p_y / patient_img.shape[1], 4)
                p_x_perc = round(p_x / patient_img.shape[2], 4)
                diameter_mm = round(diameter_row[0], 4)
                diameter_perc = round(diameter_mm / patient_img.shape[2], 4)
                nodule_chance = round(nodule_chance, 4)
                rows.append([first_index + len(rows), p_x_perc, p_y_perc, p_z_perc, diameter_perc, nodule_chance, diameter_mm])
        return rows

    for z in range(0, predict_volume_shape[0]):
        for y in range(0, predict_volume_shape[1]):
            for x in range(0, predict_volume_shape[2]):
                cube_img = patient_img[z * step:z * step + CROP_SIZE, y * step:y * step + CROP_SIZE, x * step:x * step + CROP_SIZE]
                cube_mask = patient_mask[z * step:z * step + CROP_SIZE, y * step:y * step + CROP_SIZE, x * step:x * step + CROP_SIZE]

                if cube_mask.sum() < 2000:
                    # Too little mask inside the cube; count it and move on.
                    skipped_count += 1
                else:
                    if CROP_SIZE != CUBE_SIZE:
                        cube_img = helpers.rescale_patient_images2(cube_img, (CUBE_SIZE, CUBE_SIZE, CUBE_SIZE))

                    img_prep = prepare_image_for_net3D(cube_img)
                    batch_list.append(img_prep)
                    batch_list_coords.append((z, y, x))
                    if len(batch_list) == batch_size:
                        rows = predict_batch(batch_list, batch_list_coords, annotation_index)
                        patient_predictions_csv.extend(rows)
                        all_predictions_csv.extend([patient_id] + row for row in rows)
                        annotation_index += len(rows)
                        batch_list = []
                        batch_list_coords = []

                done_count += 1
                if done_count % 10000 == 0:
                    print("Done: ", done_count, " skipped:", skipped_count)

    # Predict whatever is left in the final, partially filled batch; without
    # this the last (< batch_size) cubes never got a prediction.
    if batch_list:
        rows = predict_batch(batch_list, batch_list_coords, annotation_index)
        patient_predictions_csv.extend(rows)
        all_predictions_csv.extend([patient_id] + row for row in rows)
        annotation_index += len(rows)

    df = pandas.DataFrame(patient_predictions_csv, columns=["anno_index", "coord_x", "coord_y", "coord_z", "diameter", "nodule_chance", "diameter_mm"])
    print("Started Filtering")
    print(all_predictions_csv)
    filter_patient_nodules_predictions(df, patient_id, CROP_SIZE * magnification)
    df.to_csv(csv_target_path, index=False)

    print(predict_volume.mean())
    print("Done in : ", sw.get_elapsed_seconds(), " seconds")
def _record_batch_predictions(model, batch_list, batch_list_coords, batch_size,
                              predict_volume, step, crop_size, volume_shape,
                              patient_predictions_csv, annotation_index):
    """Run the network on one batch of cubes and record the results.

    Every cube's nodule chance is written into ``predict_volume`` at its grid
    position. Cubes whose chance exceeds ``P_TH`` additionally get a row
    appended to ``patient_predictions_csv``.

    Args:
        model: network whose ``predict`` returns two outputs; output 0 is read
            as the nodule chance and output 1 as the diameter (in mm, going by
            the ``diameter_mm`` column name).
        batch_list: prepared cubes, one entry per cube; may be shorter than
            ``batch_size`` for the final batch of a patient.
        batch_list_coords: ``(z, y, x)`` grid index of every cube in
            ``batch_list``, in the same order.
        batch_size: batch size handed to ``model.predict``.
        predict_volume: array updated in place with the per-cube chances.
        step: stride, in voxels, between neighbouring grid positions.
        crop_size: edge length, in voxels, of the cube cut from the volume.
        volume_shape: shape of the patient volume, used to express the cube
            centre as a fraction of each axis.
        patient_predictions_csv: list of rows, extended in place.
        annotation_index: index to give to the next recorded row.

    Returns:
        The annotation index to use for the next recorded row.
    """
    batch_data = numpy.vstack(batch_list)
    p = model.predict(batch_data, batch_size=batch_size)
    for (grid_z, grid_y, grid_x), chance_row, diameter_row in zip(batch_list_coords, p[0], p[1]):
        nodule_chance = chance_row[0]
        predict_volume[grid_z, grid_y, grid_x] = nodule_chance
        if nodule_chance > P_TH:
            # Centre of the cube in voxel coordinates of the patient volume.
            center_z = grid_z * step + crop_size / 2
            center_y = grid_y * step + crop_size / 2
            center_x = grid_x * step + crop_size / 2
            p_z_perc = round(center_z / volume_shape[0], 4)
            p_y_perc = round(center_y / volume_shape[1], 4)
            p_x_perc = round(center_x / volume_shape[2], 4)
            diameter_mm = round(diameter_row[0], 4)
            # The diameter is normalised by the x extent only, as before.
            diameter_perc = round(diameter_mm / volume_shape[2], 4)
            nodule_chance = round(nodule_chance, 4)
            patient_predictions_csv.append([annotation_index, p_x_perc, p_y_perc, p_z_perc, diameter_perc, nodule_chance, diameter_mm])
            annotation_index += 1
    return annotation_index


def predict_cubes(model_path, continue_job, only_patient_id=None, luna16=False, magnification=1, flip=False, train_data=True, holdout_no=-1, ext_name="", fold_count=2):
    """Slide a cube over every patient volume and write nodule predictions.

    For each patient directory found in ``settings.NDSB3_EXTRACTED_IMAGE_DIR``
    the image and mask volumes are loaded, cut into ``CUBE_SIZE`` cubes on a
    grid with stride ``PREDICT_STEP``, and fed to the network in batches. The
    rows of cubes scoring above ``P_TH`` are passed through
    ``filter_patient_nodules_predictions`` and written to
    ``<dst_dir>/<patient_id>.csv``.

    Args:
        model_path: weights file handed to ``get_net`` as ``load_weight_path``.
        continue_job: when true (and ``only_patient_id`` is None), patients
            whose CSV already exists are skipped.
        only_patient_id: if given, only this patient is processed.
        luna16: selects ``settings.LUNA_NODULE_DETECTION_DIR`` as the output
            root and disables the label-file filter. The patient ids are still
            listed from ``settings.NDSB3_EXTRACTED_IMAGE_DIR``.
        magnification: scale factor applied to image and mask when not 1; also
            part of the output directory name (``int(magnification * 10)``).
        flip: reverse each cube along its last axis before prediction and add
            ``_flip`` to the output directory name.
        train_data: choose the stage 1 labels (True) or the stage 2 sample
            submission (False) as the list of accepted patient ids; also
            enables the holdout filter.
        holdout_no: when not None and ``train_data`` is true, only patients
            whose fold (modulo ``fold_count``) equals this value are processed.
        ext_name: suffix appended to the output directory name.
        fold_count: modulus applied to the patient fold.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # NOTE(review): with the defaults (train_data=True, holdout_no=-1) the
    # holdout filter below looks like it rejects every patient, since a fold
    # taken modulo a positive fold_count is never -1 - confirm that callers
    # always pass holdout_no explicitly (None or a fold number).
    if luna16:
        dst_dir = settings.LUNA_NODULE_DETECTION_DIR
    else:
        dst_dir = settings.NDSB3_NODULE_DETECTION_DIR
    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)

    # The holdout suffix is currently always empty, so the output directory
    # does not depend on holdout_no.
    holdout_ext = ""
    flip_ext = "_flip" if flip else ""
    dst_dir += "predictions" + str(int(magnification * 10)) + holdout_ext + flip_ext + "_" + ext_name + "/"
    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)

    sw = helpers.Stopwatch.start_new()
    model = step2_train_nodule_detector.get_net(input_shape=(CUBE_SIZE, CUBE_SIZE, CUBE_SIZE, 1), load_weight_path=model_path)
    if not luna16:
        if train_data:
            labels_path = "resources/stage1_labels.csv"
        else:
            labels_path = "resources/stage2_sample_submission.csv"
        labels_df = pandas.read_csv(labels_path)
        labels_df.set_index(["id"], inplace=True)

    patient_ids = [
        file_name
        for file_name in os.listdir(settings.NDSB3_EXTRACTED_IMAGE_DIR)
        if os.path.isdir(settings.NDSB3_EXTRACTED_IMAGE_DIR + file_name)
    ]

    for patient_index, patient_id in enumerate(reversed(patient_ids)):
        if not luna16:
            if patient_id not in labels_df.index:
                continue
        if "metadata" in patient_id:
            continue
        if only_patient_id is not None and only_patient_id != patient_id:
            continue

        if holdout_no is not None and train_data:
            patient_fold = helpers.get_patient_fold(patient_id)
            patient_fold %= fold_count
            if patient_fold != holdout_no:
                continue

        print(patient_index, ": ", patient_id)
        csv_target_path = dst_dir + patient_id + ".csv"
        if continue_job and only_patient_id is None:
            if os.path.exists(csv_target_path):
                continue

        patient_img = helpers.load_patient_images(patient_id, settings.NDSB3_EXTRACTED_IMAGE_DIR, "*_i.png", [])
        if magnification != 1:
            patient_img = helpers.rescale_patient_images(patient_img, (1, 1, 1), magnification)

        patient_mask = helpers.load_patient_images(patient_id, settings.NDSB3_EXTRACTED_IMAGE_DIR, "*_m.png", [])
        if magnification != 1:
            patient_mask = helpers.rescale_patient_images(patient_mask, (1, 1, 1), magnification, is_mask_image=True)

        step = PREDICT_STEP
        CROP_SIZE = CUBE_SIZE

        # Number of grid positions per axis: every offset k * step for which
        # offset + CROP_SIZE is still strictly smaller than the axis length.
        predict_volume_shape = tuple(
            len(range(0, patient_img.shape[dim] - CROP_SIZE, step))
            for dim in range(3)
        )
        predict_volume = numpy.zeros(shape=predict_volume_shape, dtype=float)
        print("Predict volume shape: ", predict_volume.shape)

        done_count = 0
        skipped_count = 0
        batch_size = 128
        batch_list = []
        batch_list_coords = []
        patient_predictions_csv = []
        annotation_index = 0

        # ndindex walks the grid in z, y, x order (x fastest), exactly like
        # three nested range loops.
        for z, y, x in numpy.ndindex(*predict_volume_shape):
            window = (
                slice(z * step, z * step + CROP_SIZE),
                slice(y * step, y * step + CROP_SIZE),
                slice(x * step, x * step + CROP_SIZE),
            )
            cube_mask = patient_mask[window]

            # Cubes with too little mask coverage are not sent to the network.
            if cube_mask.sum() < 2000:
                skipped_count += 1
            else:
                cube_img = patient_img[window]
                if flip:
                    cube_img = cube_img[:, :, ::-1]

                if CROP_SIZE != CUBE_SIZE:
                    cube_img = helpers.rescale_patient_images2(cube_img, (CUBE_SIZE, CUBE_SIZE, CUBE_SIZE))

                batch_list.append(prepare_image_for_net3D(cube_img))
                batch_list_coords.append((z, y, x))
                if len(batch_list) >= batch_size:
                    annotation_index = _record_batch_predictions(
                        model, batch_list, batch_list_coords, batch_size,
                        predict_volume, step, CROP_SIZE, patient_img.shape,
                        patient_predictions_csv, annotation_index)
                    batch_list = []
                    batch_list_coords = []

            done_count += 1
            if done_count % 10000 == 0:
                print("Done: ", done_count, " skipped:", skipped_count)

        # Predict the cubes left over after the last full batch; without this
        # up to batch_size - 1 cubes per patient would never be evaluated.
        if batch_list:
            annotation_index = _record_batch_predictions(
                model, batch_list, batch_list_coords, batch_size,
                predict_volume, step, CROP_SIZE, patient_img.shape,
                patient_predictions_csv, annotation_index)

        df = pandas.DataFrame(patient_predictions_csv, columns=["anno_index", "coord_x", "coord_y", "coord_z", "diameter", "nodule_chance", "diameter_mm"])
        filter_patient_nodules_predictions(df, patient_id, CROP_SIZE * magnification)
        df.to_csv(csv_target_path, index=False)

        print(predict_volume.mean())
        print("Done in : ", sw.get_elapsed_seconds(), " seconds")