import os
import sys
import json
from datetime import datetime
# debug/info are assumed to be the logging module's helpers; Config is a
# project-local settings class (paths, thread count) referenced below.
from logging import debug, info
from threading import Thread

from isic_api import ISICApi


def download_images_meta(api: ISICApi, images_info: list, from_idx: int,
                         num: int, output: list, thread_num: int):
    # Fetch full details plus the segmentation listing for one slice of
    # images; results accumulate in `output` (one list per thread).
    for image in images_info[from_idx:from_idx + num]:
        time = datetime.now()
        image_detail: dict = api.getJson(f'image/{image["_id"]}')
        image_segmentation_data = api.getJson(
            f'segmentation?imageId={image["_id"]}')
        image_detail.update({'segmentation': image_segmentation_data})
        output.append(image_detail)
        debug(
            f'Thread#{thread_num} Fetched image details: {len(output)} of {num}\t'
            f'({round(len(output) / num * 100, 1)}%)\t'
            f'time remaining: {(datetime.now() - time) * (num - len(output))}')
def download_images_info(api: ISICApi) -> list:
    # Page through the /image listing; a short page signals the end.
    all_images = list()
    images_per_time = 30000
    images_downloaded = 0
    info('Fetching of image list started')
    while True:
        images: list = api.getJson(
            f'image?limit={images_per_time}&offset={images_downloaded}&sort=name'
        )
        images_received = len(images)
        images_downloaded += images_received
        all_images.extend(images)
        info(f'Fetching of image list\tDownloaded: {images_downloaded}')
        if images_received != images_per_time:
            break
    return all_images
def download_images(api: ISICApi, images_info: list, from_idx: int, num: int,
                    path: str, thread_num: int):
    # Stream the JPEG for each image in this thread's slice to disk.
    downloaded = 0
    for image in images_info[from_idx:from_idx + num]:
        image_file_output_path = os.path.join(path, f'{image["_id"]}.jpg')
        time = datetime.now()
        image_file_resp = api.get(f'image/{image["_id"]}/download')
        image_file_resp.raise_for_status()
        with open(image_file_output_path, 'wb') as image_file_output_stream:
            for chunk in image_file_resp:
                image_file_output_stream.write(chunk)
        downloaded += 1
        debug(
            f'Thread#{thread_num} Downloaded images: {downloaded} of {num}\t'
            f'({round(downloaded / num * 100, 1)}%)\t'
            f'time remaining: {(datetime.now() - time) * (num - downloaded)}')
def download_segmentation(api: ISICApi, images_info: list, from_idx: int,
                          num: int, path: str, thread_num: int):
    # Download every segmentation mask attached to each image in the slice,
    # skipping masks that are already on disk.
    downloaded = 0
    for image in images_info[from_idx:from_idx + num]:
        time = datetime.now()
        for segmentation in image['segmentation']:
            segmentation_file_output_path = os.path.join(
                path, f'{segmentation["_id"]}.jpg')
            if os.path.exists(segmentation_file_output_path):
                continue
            image_file_resp = api.get(
                f'segmentation/{segmentation["_id"]}/mask')
            image_file_resp.raise_for_status()
            with open(segmentation_file_output_path, 'wb') as mask_output_stream:
                for chunk in image_file_resp:
                    mask_output_stream.write(chunk)
        downloaded += 1
        debug(
            f'Thread#{thread_num} Downloaded masks for {downloaded} of {num} images\t'
            f'({round(downloaded / num * 100, 1)}%)\t'
            f'time remaining: {(datetime.now() - time) * (num - downloaded)}')
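# Every snippet in this section imports ISICApi from a local isic_api.py.
# For reference, a minimal sketch of that helper, assuming the archive's
# Girder-token authentication flow (shape based on the official ISIC example
# client; treat endpoint details as assumptions, not a definitive
# implementation):
import requests


class ISICApi:
    def __init__(self, hostname='https://isic-archive.com',
                 username=None, password=None):
        self.base_url = f'{hostname}/api/v1'
        self.auth_token = None
        if username is not None:
            # Basic-auth login returns a token sent on later requests.
            auth_response = requests.get(
                f'{self.base_url}/user/authentication',
                auth=(username, password))
            auth_response.raise_for_status()
            self.auth_token = auth_response.json()['authToken']['token']

    def get(self, endpoint):
        headers = {'Girder-Token': self.auth_token} if self.auth_token else None
        return requests.get(f'{self.base_url}/{endpoint}', headers=headers)

    def getJson(self, endpoint):
        return self.get(endpoint).json()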
from isic_api import ISICApi
import os
import json
import csv
from tqdm import tqdm

with open('config.json') as json_file:
    data = json.load(json_file)

api = ISICApi(username=data["user"], password=data["pw"])
data_path = data["data_folder"]
num_imgs = data["num_imgs"]

if not os.path.exists(data_path):
    os.makedirs(data_path)

imageList = api.getJson('image?limit=' + str(num_imgs) + '&offset=0&sort=name')

#%%
print('Fetching metadata for %s images' % len(imageList))
imageDetails = []
for image in tqdm(imageList):
    # Fetch the full image details
    imageDetail = api.getJson('image/%s' % image['_id'])
    imageDetails.append(imageDetail)

# Determine the union of all image metadata fields
metadataFields = set(
    field
    for imageDetail in imageDetails
    for field in imageDetail['meta']['clinical'].keys()
)
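# The config.json consumed above (and by the next script) has this shape,
# inferred from the keys the scripts read; the values here are placeholders,
# not values from the original:
# {
#     "user": "your-isic-username",
#     "pw": "your-isic-password",
#     "data_folder": "data",
#     "num_imgs": 1000
# }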
from isic_api import ISICApi
import os
import json

with open('config.json') as json_file:
    data = json.load(json_file)

api = ISICApi(username=data["user"], password=data["pw"])
data_path = data["data_folder"]
num_imgs = data["num_imgs"]

#%%
savePath = os.path.join(data_path, 'raw')
if not os.path.exists(savePath):
    os.makedirs(savePath)
start_offset = 0

#%%
for i in range(int(num_imgs / 50) + 1):
    imageList = api.getJson('image?limit=50&offset=' + str(start_offset) +
                            '&sort=name')
    print('Downloading %s images' % len(imageList))
    for image in imageList:
        print(image['_id'])
        imageFileResp = api.get('image/%s/download' % image['_id'])
        imageFileResp.raise_for_status()
        imageFileOutputPath = os.path.join(savePath, '%s.jpg' % image['name'])
        with open(imageFileOutputPath, 'wb') as imageFileOutputStream:
            for chunk in imageFileResp:
                imageFileOutputStream.write(chunk)
    # Advance the paging window; without this the same 50 images would be
    # fetched on every pass.
    start_offset += 50
import os
import csv
import requests
from isic_api import ISICApi


def main(offset, count, meta=True):
    api = ISICApi()
    savePath = '../../ISICArchive/'
    if not os.path.exists(savePath):
        os.makedirs(savePath)

    imageList = api.getJson(f'image?limit={count}&offset={offset}&sort=name')
    i = count - 1

    if meta:
        print('Fetching metadata for %s images' % len(imageList))
        imageDetails = []
        for ind, image in enumerate(imageList):
            print(' ', image['name'])
            # Fetch the full image details
            try:
                imageDetail = api.getJson('image/%s' % image['_id'])
                imageDetails.append(imageDetail)
            except requests.exceptions.ConnectionError:
                # On a dropped connection, re-fetch the listing and carry on.
                imageList = api.getJson(
                    f'image?limit={count}&offset={offset}&sort=name')
                # i = ind
                # break

        # Determine the union of all image metadata fields
        metadataFields = set(
            field
            for imageDetail in imageDetails
            for field in imageDetail['meta']['clinical'].keys())
        metadataFields = ['isic_id'] + sorted(metadataFields)

        # Write the metadata to a CSV
        outputFileName = f"metadata_{offset}_{offset + i}"
        print('Writing metadata to CSV: %s' % outputFileName + '.csv')
        with open(savePath + outputFileName + '.csv', 'w') as outputStream:
            csvWriter = csv.DictWriter(outputStream, metadataFields)
            csvWriter.writeheader()
            for imageDetail in imageDetails:
                rowDict = imageDetail['meta']['clinical'].copy()
                rowDict['isic_id'] = imageDetail['name']
                csvWriter.writerow(rowDict)

    print('Downloading %s images' % len(imageList))
    imageDetails = []
    for ind, image in enumerate(imageList):
        if ind > i:
            break
        print(image['name'])
        try:
            imageFileResp = api.get('image/%s/download' % image['_id'])
            imageFileResp.raise_for_status()
            imageFileOutputPath = os.path.join(savePath,
                                               '%s.jpg' % image['name'])
            with open(imageFileOutputPath, 'wb') as imageFileOutputStream:
                for chunk in imageFileResp:
                    imageFileOutputStream.write(chunk)
        except requests.exceptions.ConnectionError:
            # imageList = api.getJson(
            #     f'image?limit={count-ind}&offset={offset+ind}&sort=name')
            print(ind, "FAILED.")
            break
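# One way to drive main() in fixed-size batches; the batch size and total
# below are illustrative values, not from the original script:
if __name__ == '__main__':
    batch = 200
    for offset in range(0, 1000, batch):
        main(offset, batch, meta=True)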
import argparse
import multiprocessing as mp
from os import mkdir
from os.path import join
from shutil import copyfile

import numpy as np
import pandas as pd
from tqdm import tqdm

from isic_api import ISICApi


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--account_user',
        required=True,
        help="Create account at https://www.isic-archive.com/#!/topWithHeader/wideContentTop/main"
    )
    parser.add_argument('--account_password', required=True)
    parser.add_argument('--out_dir',
                        required=True,
                        help="directory where dataset will be stored")
    options = parser.parse_args()

    ### API authentication
    api = ISICApi(username=options.account_user,
                  password=options.account_password)

    ### get list of images
    imageList = api.getJson('image?limit=100000&offset=0&sort=name')
    print('Got list of {} images'.format(len(imageList)))

    ### download images and save metadata
    out_dir = options.out_dir
    for subfolder in ['images', 'dataset', 'meta']:
        try:
            mkdir(join(out_dir, subfolder))
        except FileExistsError:
            print('{} already exists'.format(subfolder))
    for subfolder in ['case', 'control']:
        try:
            mkdir(join(out_dir, 'dataset', subfolder))
        except FileExistsError:
            print('{} already exists'.format(subfolder))

    ### download images
    pool = mp.Pool(processes=mp.cpu_count())
    f_list = []
    for image in tqdm(imageList, desc="downloading dataset"):
        # Pass the callable and its arguments separately; calling
        # download_image(...) here would run it synchronously and hand its
        # return value to apply_async.
        f_list.append(
            pool.apply_async(download_image,
                             (image, join(out_dir, 'images'), api)))
    pool.close()
    # Wait for all downloads before splitting into case/control.
    for f in f_list:
        f.get()
    pool.join()

    ### get metadata of images
    # meta_data = load_meta_data(imageList, api)
    # meta_data.to_pickle(join(out_dir, 'meta', 'metadata.pkl'))
    # meta_data.to_csv(join(out_dir, 'meta', 'metadata.csv'), index=False)
    meta_data = pd.read_pickle(join(out_dir, 'meta', 'metadata.pkl'))

    ### Identify malignant images and download 1x malignant and 2x benign images
    malignant = meta_data[meta_data.meta_clinical_benign_malignant ==
                          'malignant']._id.values
    benign = meta_data[meta_data.meta_clinical_benign_malignant ==
                       'benign']._id.values
    np.random.seed(0)
    benign_sample = np.random.choice(benign,
                                     2 * malignant.shape[0],
                                     replace=False)
    accept_ids = np.concatenate([malignant, benign_sample])

    ### get list of images to be used as case/controls
    imageList = [var for var in imageList if var['_id'] in accept_ids]
    for image in tqdm(imageList,
                      desc="splitting images in case/control groups"):
        f_name = "%s.jpg" % image['name']
        f_path = join(out_dir, 'images', f_name)
        is_malignant = image['_id'] in malignant
        save_to = join(out_dir, 'dataset',
                       'case' if is_malignant else 'control')
        copyfile(f_path, join(save_to, f_name))
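# download_image() is referenced above but not shown in this section. A
# plausible worker matching the call site (image dict, target directory,
# ISICApi instance) would follow the same streaming pattern as the other
# scripts here; this is a sketch under that assumption:
import os


def download_image(image, out_dir, api):
    # Skip files that are already on disk so re-runs are cheap.
    out_path = os.path.join(out_dir, '%s.jpg' % image['name'])
    if os.path.exists(out_path):
        return out_path
    resp = api.get('image/%s/download' % image['_id'])
    resp.raise_for_status()
    with open(out_path, 'wb') as out_stream:
        for chunk in resp:
            out_stream.write(chunk)
    return out_path

# Note that with a process pool, this function and the api object must be
# picklable; for I/O-bound downloads a thread pool is often the simpler fit.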
import json
import csv
import os
import urllib
from io import BytesIO, StringIO

import pandas as pd
from pandas.io.json import json_normalize
from PIL import Image, ImageDraw, ImageFont

from isic_api import ISICApi

# 9841 - 9920  ## ISIC_557 - 9867

api = ISICApi()
savePath = 'dataset_images/'

if not os.path.exists(savePath):
    os.makedirs(savePath)

data = pd.read_csv('metadata.csv')
id_data = pd.read_csv('name_id.csv')

diseases = set()
for i in range(len(data)):
    diseases.add(data.loc[i]['diagnosis'])

# Create a folder per diagnosis
for diagnosis in diseases:
    # (Body inferred from the comment above: one subfolder per diagnosis.)
    diagnosis_path = os.path.join(savePath, str(diagnosis))
    if not os.path.exists(diagnosis_path):
        os.makedirs(diagnosis_path)
def main():
    username = sys.argv[1]
    password = sys.argv[2]
    info(f'Username: {username}\tPassword: {password}')
    api = ISICApi(username=username, password=password)

    if not os.path.exists(Config.WORKSPACE_PATH):
        os.mkdir(Config.WORKSPACE_PATH)

    path_to_images_meta = os.path.join(Config.WORKSPACE_PATH,
                                       Config.IMAGES_META)
    if not os.path.exists(path_to_images_meta):
        # Fetch details in NUM_THREADS slices, one output list per thread.
        all_images = download_images_info(api)
        outputs = [list() for _ in range(Config.NUM_THREADS)]
        threads = list()
        for thread_idx in range(Config.NUM_THREADS):
            from_idx = thread_idx * len(all_images) // Config.NUM_THREADS
            to_idx = (thread_idx + 1) * len(all_images) // Config.NUM_THREADS
            num_images = to_idx - from_idx
            thread = Thread(target=download_images_meta,
                            args=(api, all_images, from_idx, num_images,
                                  outputs[thread_idx], thread_idx))
            thread.daemon = True  # setDaemon() is deprecated
            threads.append(thread)
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()

        images_meta = list()
        for output in outputs:
            images_meta.extend(output)
        if len(images_meta) != 0:
            with open(path_to_images_meta, "w") as write_file:
                json.dump(images_meta, write_file, indent=4)
    else:
        with open(path_to_images_meta, "r") as read_file:
            images_meta = json.load(read_file)

    info(f'Number of images before script execution: {len(images_meta)}')

    if False:  # download images
        images_path = os.path.join(Config.WORKSPACE_PATH, Config.IMAGES_PATH)
        if not os.path.exists(images_path):
            os.mkdir(images_path)
        # Keep only images not yet on disk; building the filtered list in a
        # separate variable avoids clobbering images_meta before iterating it.
        remaining_images = list()
        for image in images_meta:
            if not os.path.exists(
                    os.path.join(images_path, f'{image["_id"]}.jpg')):
                remaining_images.append(image)
        images_meta = remaining_images
        threads = list()
        for thread_idx in range(Config.NUM_THREADS):
            from_idx = thread_idx * len(images_meta) // Config.NUM_THREADS
            to_idx = (thread_idx + 1) * len(images_meta) // Config.NUM_THREADS
            num_images = to_idx - from_idx
            thread = Thread(target=download_images,
                            args=(api, images_meta, from_idx, num_images,
                                  images_path, thread_idx))
            thread.daemon = True
            threads.append(thread)
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()

    if False:  # download segmentation masks
        segmentation_path = os.path.join(Config.WORKSPACE_PATH,
                                         Config.SEGMENTATION_PATH)
        if not os.path.exists(segmentation_path):
            os.mkdir(segmentation_path)
        threads = list()
        for thread_idx in range(Config.NUM_THREADS):
            from_idx = thread_idx * len(images_meta) // Config.NUM_THREADS
            to_idx = (thread_idx + 1) * len(images_meta) // Config.NUM_THREADS
            num_images = to_idx - from_idx
            thread = Thread(target=download_segmentation,
                            args=(api, images_meta, from_idx, num_images,
                                  segmentation_path, thread_idx))
            thread.daemon = True
            threads.append(thread)
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()

    if False:  # drop segmentation entries whose masks could not be found
        missing_masks = [
            "584727129fc3c10f04859aad", "58470b479fc3c10f04859672"
        ]
        with open(path_to_images_meta, "r") as read_file:
            images_meta = json.load(read_file)
        for image in images_meta:
            # Filter instead of remove()-ing while iterating, which would
            # skip elements.
            for segmentation in image["segmentation"]:
                if segmentation["_id"] in missing_masks:
                    info(f'Segmentation {segmentation["_id"]} will be removed')
            image["segmentation"] = [
                s for s in image["segmentation"]
                if s["_id"] not in missing_masks
            ]
        if len(images_meta) != 0:
            with open(path_to_images_meta, "w") as write_file:
                json.dump(images_meta, write_file, indent=4)
            info("File written")

    if False:  # drop images that have no segmentation at all
        with open(path_to_images_meta, "r") as read_file:
            images_meta = json.load(read_file)
        kept_images = list()
        for image in images_meta:
            if len(image["segmentation"]) == 0:
                info(f'Image {image["_id"]} will be removed')
            else:
                kept_images.append(image)
        images_meta = kept_images
        if len(images_meta) != 0:
            with open(path_to_images_meta, "w") as write_file:
                json.dump(images_meta, write_file, indent=4)
            info("File written")

    if False:  # delete image files that no longer have metadata
        with open(path_to_images_meta, "r") as read_file:
            images_meta = json.load(read_file)
        images_ids = []
        for image in images_meta:
            images_ids.append(image["_id"] + ".jpg")
        images_path = os.path.join(Config.WORKSPACE_PATH, Config.IMAGES_PATH)
        for image in os.listdir(images_path):
            if image not in images_ids:
                os.remove(os.path.join(images_path, image))
                info(f'{image} deleted')

    if True:  # count images with a diagnosis or benign/malignant label
        with open(path_to_images_meta, "r") as read_file:
            images_meta = json.load(read_file)
        num = 0
        for image in images_meta:
            if "diagnosis" in image["meta"]["clinical"].keys() or \
                    "benign_malignant" in image["meta"]["clinical"].keys():
                num += 1
        print("Num:", num)

    with open(path_to_images_meta, "r") as read_file:
        images_meta = json.load(read_file)
    info(f'Number of images after script execution: {len(images_meta)}')
    # Use .get() since not every record carries the benign_malignant key.
    print(
        "Benign:",
        len(list(filter(
            lambda x: x["meta"]["clinical"].get("benign_malignant") == "benign",
            images_meta))))
    print(
        "Malignant:",
        len(list(filter(
            lambda x: x["meta"]["clinical"].get("benign_malignant") == "malignant",
            images_meta))))
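# For orientation, each record in the images-meta JSON file combines the
# /image/{id} detail with the segmentation listing, so a simplified (and
# assumed) entry looks roughly like:
# {
#     "_id": "...",
#     "name": "ISIC_0000000",
#     "meta": {"clinical": {"diagnosis": "...", "benign_malignant": "benign"}},
#     "segmentation": [{"_id": "..."}]
# }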
import json
import csv
import pandas as pd
from isic_api import ISICApi
from pandas.io.json import json_normalize

# Initialize the API; no login is necessary for public data
api = ISICApi(username="******", password="******")
outputFileName = 'imagedata'

imageList = api.getJson('image?limit=25000&offset=0&sort=name')
print('Fetching metadata for %s images' % len(imageList))

imageDetails = []
i = 0
for image in imageList:
    print(' ', image['name'])
    # Pull image details
    imageDetail = api.getJson('image/%s' % image['_id'])
    imageDetails.append(imageDetail)

"""
# Testing Parameters
print("****************************")
print(imageDetails[0]['meta']['clinical']['anatom_site_general'])
print("****************************")
data = json_normalize(imageDetails[0])
print(data.loc[0])
print("========================================================")
"""
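# The snippet stops before any output is written. Given the csv import and
# outputFileName above, a plausible continuation (an assumption, mirroring
# the CSV-writing pattern used earlier in this section) would be:
metadataFields = set(
    field
    for imageDetail in imageDetails
    for field in imageDetail['meta']['clinical'].keys())
metadataFields = ['isic_id'] + sorted(metadataFields)
with open(outputFileName + '.csv', 'w') as outputStream:
    csvWriter = csv.DictWriter(outputStream, metadataFields)
    csvWriter.writeheader()
    for imageDetail in imageDetails:
        rowDict = imageDetail['meta']['clinical'].copy()
        rowDict['isic_id'] = imageDetail['name']
        csvWriter.writerow(rowDict)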