Ejemplo n.º 1
0
def detect_gid_list(ibs, gid_list, tree_path_list, downsample=True, **kwargs):
    """
    Args:
        gid_list (list of int): the list of IBEIS image_rowids that need detection
        tree_path_list (list of str): the list of trees to load for detection
        downsample (bool, optional): a flag to indicate if the original image
            sizes should be used; defaults to True

            True:  ibs.get_image_detectpaths() is used
            False: ibs.get_image_paths() is used

    Kwargs (optional): refer to the PyRF documentation for configuration settings

    Yields:
        results (list of dict)
    """
    # Get new gpaths if downsampling
    if downsample:
        gpath_list = ibs.get_image_detectpaths(gid_list)
        neww_list = [vt.open_image_size(gpath)[0] for gpath in gpath_list]
        oldw_list = [oldw for (oldw, oldh) in ibs.get_image_sizes(gid_list)]
        downsample_list = [oldw / neww for oldw, neww in zip(oldw_list, neww_list)]
    else:
        gpath_list = ibs.get_image_paths(gid_list)
        downsample_list = [None] * len(gpath_list)
    # Run detection
    results_iter = detect(ibs, gpath_list, tree_path_list, **kwargs)
    # Upscale the results
    for gid, downsample, (gpath, result_list) in zip(gid_list, downsample_list, results_iter):
        # Upscale the results back up to the original image size
        if downsample is not None and downsample != 1.0:
            for result in result_list:
                for key in ['centerx', 'centery', 'xtl', 'ytl', 'width', 'height']:
                    result[key] = int(result[key] * downsample)
        yield gid, gpath, result_list
 def thumb_getter(id_, thumbsize=128):
     """ Thumb getters must conform to thumbtup structure """
     if id_ not in imgname_list:
         return {
             'fpath': id_ + '.jpg',
             'thread_func': thread_func,
             'main_func': lambda: (id_, ),
         }
     # print(id_)
     if id_ == 'doesnotexist.jpg':
         return None
         img_path = None
         img_size = (100, 100)
     else:
         img_path = ut.grab_test_imgpath(id_, verbose=False)
         img_size = vt.open_image_size(img_path)
     thumb_path = join(guitool_test_thumbdir,
                       ut.hashstr(str(img_path)) + '.jpg')
     if id_ == 'carl.jpg':
         bbox_list = [(10, 10, 200, 200)]
         theta_list = [0]
     elif id_ == 'lena.png':
         # bbox_list = [(10, 10, 200, 200)]
         bbox_list = [None]
         theta_list = [None]
     else:
         bbox_list = []
         theta_list = []
     interest_list = [False]
     thumbtup = (thumb_path, img_path, img_size, bbox_list, theta_list,
                 interest_list)
     # print('thumbtup = %r' % (thumbtup,))
     return thumbtup
Ejemplo n.º 3
0
def detect_gid_list(ibs, gid_list, tree_path_list, downsample=True, **kwargs):
    """
    Args:
        gid_list (list of int): the list of IBEIS image_rowids that need detection
        tree_path_list (list of str): the list of trees to load for detection
        downsample (bool, optional): a flag to indicate if the original image
            sizes should be used; defaults to True

            True:  ibs.get_image_detectpaths() is used
            False: ibs.get_image_paths() is used

    Kwargs (optional): refer to the PyRF documentation for configuration settings

    Yields:
        results (list of dict)
    """
    # Get new gpaths if downsampling
    if downsample:
        gpath_list = ibs.get_image_detectpaths(gid_list)
        neww_list = [vt.open_image_size(gpath)[0] for gpath in gpath_list]
        oldw_list = [oldw for (oldw, oldh) in ibs.get_image_sizes(gid_list)]
        downsample_list = [oldw / neww for oldw, neww in zip(oldw_list, neww_list)]
    else:
        gpath_list = ibs.get_image_paths(gid_list)
        downsample_list = [None] * len(gpath_list)
    # Run detection
    results_iter = detect(ibs, gpath_list, tree_path_list, **kwargs)
    # Upscale the results
    for gid, downsample, (gpath, result_list) in zip(gid_list, downsample_list, results_iter):
        # Upscale the results back up to the original image size
        if downsample is not None and downsample != 1.0:
            for result in result_list:
                for key in ["centerx", "centery", "xtl", "ytl", "width", "height"]:
                    result[key] = int(result[key] * downsample)
        yield gid, gpath, result_list
Ejemplo n.º 4
0
def detect_gid_list(ibs, gid_list, downsample=False, **kwargs):
    """
    Args:
        gid_list (list of int): the list of IBEIS image_rowids that need detection
        downsample (bool, optional): a flag to indicate if the original image
            sizes should be used; defaults to True

            True:  ibs.get_image_detectpaths() is used
            False: ibs.get_image_paths() is used

    Kwargs (optional): refer to the PyDarknet documentation for configuration settings

    Yields:
        results (list of dict)
    """
    # Get new gpaths if downsampling
    if downsample:
        gpath_list = ibs.get_image_detectpaths(gid_list)
        neww_list = [vt.open_image_size(gpath)[0] for gpath in gpath_list]
        oldw_list = [oldw for (oldw, oldh) in ibs.get_image_sizes(gid_list)]
        downsample_list = [
            oldw / neww for oldw, neww in zip(oldw_list, neww_list)
        ]
        orient_list = [1] * len(gid_list)
    else:
        gpath_list = ibs.get_image_paths(gid_list)
        downsample_list = [None] * len(gpath_list)
        orient_list = ibs.get_image_orientation(gid_list)
    # Run detection
    results_iter = detect(gpath_list, **kwargs)
    # Upscale the results
    for downsample, gid, orient, (gpath,
                                  result_list) in zip(downsample_list,
                                                      gid_list, orient_list,
                                                      results_iter):
        # Upscale the results back up to the original image size
        for result in result_list:
            if downsample is not None and downsample != 1.0:
                for key in ['xtl', 'ytl', 'width', 'height']:
                    result[key] = int(result[key] * downsample)
            bbox = (
                result['xtl'],
                result['ytl'],
                result['width'],
                result['height'],
            )
            bbox_list = [bbox]
            bbox_list = ibs.fix_horizontal_bounding_boxes_to_orient(
                gid, bbox_list)
            bbox = bbox_list[0]
            result['xtl'], result['ytl'], result['width'], result[
                'height'] = bbox
        yield (gid, gpath, result_list)
Ejemplo n.º 5
0
def resize_imagelist_to_sqrtarea(gpath_list, new_gpath_list=None,
                                 sqrt_area=800, output_dir=None,
                                 checkexists=True,
                                 **kwargs):
    """ Resizes images and yeilds results asynchronously  """
    import vtool as vt
    target_area = sqrt_area ** 2
    # Read image sizes
    gsize_list = [vt.open_image_size(gpath) for gpath in gpath_list]
    # Compute new sizes which preserve aspect ratio
    newsize_list = [vt.ScaleStrat.area(target_area, wh) for wh in gsize_list]
    if new_gpath_list is None:
        # Compute names for the new images if not given
        if output_dir is None:
            # Create an output directory if not specified
            output_dir      = 'resized_sqrtarea%r' % sqrt_area
        ut.ensuredir(output_dir)
        size_suffixs =  ['_' + repr(newsize).replace(' ', '') for newsize in newsize_list]
        from os.path import basename
        old_gnames = [basename(p) for p in gpath_list]
        new_gname_list = [ut.augpath(p, suffix=s)
                          for p, s in zip(old_gnames, size_suffixs)]
        new_gpath_list = [join(output_dir, gname) for gname in new_gname_list]
        new_gpath_list = list(map(ut.unixpath, new_gpath_list))
    assert len(new_gpath_list) == len(gpath_list), 'unequal len'
    assert len(newsize_list) == len(gpath_list), 'unequal len'
    # Evaluate generator
    if checkexists:
        exists_list = list(map(exists, new_gpath_list))
        gpath_list_ = ut.filterfalse_items(gpath_list, exists_list)
        new_gpath_list_ = ut.filterfalse_items(new_gpath_list, exists_list)
        newsize_list_ = ut.filterfalse_items(newsize_list, exists_list)
    else:
        gpath_list_ = gpath_list
        new_gpath_list_ = new_gpath_list
        newsize_list_ = newsize_list
    generator = resize_imagelist_generator(gpath_list_, new_gpath_list_,
                                           newsize_list_, **kwargs)
    for res in generator:
        pass
    #return [res for res in generator]
    return new_gpath_list
Ejemplo n.º 6
0
def detect_gid_list(ibs, gid_list, downsample=False, **kwargs):
    """
    Args:
        gid_list (list of int): the list of IBEIS image_rowids that need detection
        downsample (bool, optional): a flag to indicate if the original image
            sizes should be used; defaults to True

            True:  ibs.get_image_detectpaths() is used
            False: ibs.get_image_paths() is used

    Kwargs (optional): refer to the PyDarknet documentation for configuration settings

    Yields:
        results (list of dict)
    """
    # Get new gpaths if downsampling
    if downsample:
        gpath_list = ibs.get_image_detectpaths(gid_list)
        neww_list = [vt.open_image_size(gpath)[0] for gpath in gpath_list]
        oldw_list = [oldw for (oldw, oldh) in ibs.get_image_sizes(gid_list)]
        downsample_list = [oldw / neww for oldw, neww in zip(oldw_list, neww_list)]
        orient_list = [1] * len(gid_list)
    else:
        gpath_list = ibs.get_image_paths(gid_list)
        downsample_list = [None] * len(gpath_list)
        orient_list = ibs.get_image_orientation(gid_list)
    # Run detection
    results_iter = detect(gpath_list, **kwargs)
    # Upscale the results
    for downsample, gid, orient, (gpath, result_list) in zip(downsample_list, gid_list, orient_list, results_iter):
        # Upscale the results back up to the original image size
        for result in result_list:
            if downsample is not None and downsample != 1.0:
                for key in ['xtl', 'ytl', 'width', 'height']:
                    result[key] = int(result[key] * downsample)
            bbox = (result['xtl'], result['ytl'], result['width'], result['height'], )
            bbox_list = [ bbox ]
            bbox_list = ibs.fix_horizontal_bounding_boxes_to_orient(gid, bbox_list)
            bbox = bbox_list[0]
            result['xtl'], result['ytl'], result['width'], result['height'] = bbox
        yield (gid, gpath, result_list)
Ejemplo n.º 7
0
def check_image_sizes(data_uri_order, all_kpts, offset_list):
    """
    Check if any keypoints go out of bounds wrt their associated images
    """
    import vtool as vt
    from os.path import join
    imgdir = ut.truepath('/raid/work/Oxford/oxbuild_images')
    gpath_list = [join(imgdir, imgid + '.jpg') for imgid in data_uri_order]
    imgsize_list = [vt.open_image_size(gpath) for gpath in gpath_list]
    kpts_list = [all_kpts[l:r] for l, r in ut.itertwo(offset_list)]

    kpts_extent = [
        vt.get_kpts_image_extent(kpts, outer=False, only_xy=False)
        for kpts in ut.ProgIter(kpts_list, 'kpts extent')
    ]

    for i, (size, extent) in enumerate(zip(imgsize_list, kpts_extent)):
        w, h = size
        _, maxx, _, maxy = extent
        assert np.isnan(maxx) or maxx < w
        assert np.isnan(maxy) or maxy < h
Ejemplo n.º 8
0
 def thumb_getter(id_, thumbsize=128):
     """ Thumb getters must conform to thumbtup structure """
     #print(id_)
     if id_ == 'doesnotexist.jpg':
         return None
         img_path = None
         img_size = (100, 100)
     else:
         img_path = ut.grab_test_imgpath(id_, verbose=False)
         img_size = vt.open_image_size(img_path)
     thumb_path = join(guitool_test_thumbdir, ut.hashstr(str(img_path)) + '.jpg')
     if id_ == 'carl.jpg':
         bbox_list = [(10, 10, 200, 200)]
         theta_list = [0]
     elif id_ == 'lena.png':
         #bbox_list = [(10, 10, 200, 200)]
         bbox_list = [None]
         theta_list = [None]
     else:
         bbox_list = []
         theta_list = []
     thumbtup = (thumb_path, img_path, img_size, bbox_list, theta_list)
     #print('thumbtup = %r' % (thumbtup,))
     return thumbtup
def read_thumb_size(thumb_path):
    import vtool as vt

    if VERBOSE_THUMB:
        print('[ThumbDelegate] Reading thumb size')
    # npimg = vt.imread(thumb_path, delete_if_corrupted=True)
    # (height, width) = npimg.shape[0:2]
    # del npimg
    try:
        width, height = vt.open_image_size(thumb_path)
    except IOError as ex:
        if ut.checkpath(thumb_path, verbose=True):
            ut.printex(
                ex,
                'image=%r seems corrupted. Needs deletion' % (thumb_path, ),
                iswarning=True,
            )
            ut.delete(thumb_path)
        else:
            ut.printex(ex,
                       'image=%r does not exist', (thumb_path, ),
                       iswarning=True)
        raise
    return width, height
Ejemplo n.º 10
0
def download_sharks(XMLdata, number):
    """
    cd ~/work/WS_ALL
    python -m ibeis.scripts.getshark

    >>> from ibeis.scripts.getshark import *  # NOQA
    >>> url = 'www.whaleshark.org/listImages.jsp'
    >>> XMLdata = ut.url_read(url)
    >>> number = None
    """
    # Prepare the output directory for writing, if it doesn't exist
    output_dir = 'sharkimages'
    ut.ensuredir(output_dir)

    dom = parseString(XMLdata)

    # Download files
    if number:
        maxCount = min(number, len(dom.getElementsByTagName('img')))
    else:
        maxCount = len(dom.getElementsByTagName('img'))

    parsed_info = dict(
        img_url_list=[],
        localid_list=[],
        nameid_list=[],
        orig_fname_list=[],
        new_fname_list=[],
    )

    print('Preparing to fetch %i files...' % maxCount)

    for shark in dom.getElementsByTagName('shark'):
        localCount = 0
        for imageset in shark.getElementsByTagName('imageset'):
            for img in imageset.getElementsByTagName('img'):
                localCount += 1

                img_url = img.getAttribute('href')
                orig_fname = split(img_url)[1]
                ext = splitext(orig_fname)[1].lower()
                nameid = shark.getAttribute('number')

                new_fname = '%s-%i%s' % (
                    nameid, localCount, ext)

                parsed_info['img_url_list'].append(img_url)
                parsed_info['nameid_list'].append(nameid)
                parsed_info['localid_list'].append(localCount)
                parsed_info['orig_fname_list'].append(orig_fname)
                parsed_info['new_fname_list'].append(new_fname)

                print('Parsed %i / %i files.' % (len(parsed_info['orig_fname_list']), maxCount))

                if number is not None and len(parsed_info['orig_fname_list']) == number:
                    break
    parsed_info['new_fpath_list'] = [join(output_dir, _fname)
                                     for _fname in parsed_info['new_fname_list']]

    print('Filtering parsed images')

    # Filter based on image type (keep only jpgs)
    ext_flags = [_fname.endswith('.jpg') or _fname.endswith('.jpg')
                  for _fname in parsed_info['new_fname_list']]
    parsed_info = {key: ut.compress(list_, ext_flags) for key, list_ in parsed_info.items()}

    # Filter to only images matching the appropriate tags
    from ibeis import tag_funcs
    parsed_info['tags_list'] = parse_shark_tags(parsed_info['orig_fname_list'])
    tag_flags = tag_funcs.filterflags_general_tags(
        parsed_info['tags_list'],
        has_any=['view-left'],
        none_match=['qual.*', 'view-top', 'part-.*', 'cropped'],
    )
    parsed_info = {key: ut.compress(list_, tag_flags) for key, list_ in parsed_info.items()}
    print('Tags in chosen images:')
    print(ut.dict_hist(ut.flatten(parsed_info['tags_list'] )))

    # Download selected subset
    print('Downloading selected subset')
    _iter = list(zip(parsed_info['img_url_list'],
                     parsed_info['new_fpath_list']))
    _iter = ut.ProgressIter(_iter, lbl='downloading sharks')
    for img_url, new_fpath in _iter:
        if not exists(new_fpath):
            ut.download_url(img_url, new_fpath)

    # Remove corrupted or ill-formatted images
    print('Checking for corrupted images')
    import vtool as vt
    noncorrupt_flags = vt.filterflags_valid_images(parsed_info['new_fpath_list'])
    parsed_info = {
        key: ut.compress(list_, noncorrupt_flags)
        for key, list_ in parsed_info.items()
    }

    print('Removing small images')
    import numpy as np
    imgsize_list = np.array([vt.open_image_size(gpath) for gpath in parsed_info['new_fpath_list']])
    sqrt_area_list = np.sqrt(np.prod(imgsize_list, axis=1))
    areq_flags_list = sqrt_area_list >= 750
    parsed_info = {key: ut.compress(list_, areq_flags_list)
                   for key, list_ in parsed_info.items()}

    grouped_idxs = ut.group_items(list(range(len(parsed_info['nameid_list']))),
                                  parsed_info['nameid_list'])
    keep_idxs = sorted(ut.flatten([idxs for key, idxs in grouped_idxs.items() if len(idxs) >= 2]))
    parsed_info = {key: ut.take(list_, keep_idxs) for key, list_ in parsed_info.items()}

    print('Moving imagse to secondary directory')
    named_outputdir = 'named-left-sharkimages'
    # Build names
    parsed_info['namedir_fpath_list'] = [
        join(named_outputdir, _nameid, _fname)
        for _fname, _nameid in zip(parsed_info['new_fname_list'],
                                   parsed_info['nameid_list'])]
    # Create directories
    ut.ensuredir(named_outputdir)
    named_dirs = ut.unique_ordered(list(map(dirname, parsed_info['namedir_fpath_list'])))
    for dir_ in named_dirs:
        ut.ensuredir(dir_)
    # Copy
    ut.copy_files_to(src_fpath_list=parsed_info['new_fpath_list'],
                     dst_fpath_list=parsed_info['namedir_fpath_list'])
Ejemplo n.º 11
0
def detect_gid_list(ibs, gid_list, downsample=True, verbose=VERBOSE_SS, **kwargs):
    """
    Args:
        gid_list (list of int): the list of IBEIS image_rowids that need detection
        downsample (bool, optional): a flag to indicate if the original image
            sizes should be used; defaults to True

            True:  ibs.get_image_detectpaths() is used
            False: ibs.get_image_paths() is used

    Kwargs (optional): refer to the SSD documentation for configuration settings

    Args:
        ibs (ibeis.IBEISController):  image analysis api
        gid_list (list of int): the list of IBEIS image_rowids that need detection
        downsample (bool, optional): a flag to indicate if the original image
                sizes should be used; defaults to True

    Kwargs:
        detector, config_filepath, weights_filepath, verbose

    Yields:
        tuple: (gid, gpath, result_list)

    CommandLine:
        python -m ibeis.algo.detect.ssd detect_gid_list --show

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.algo.detect.ssd import *  # NOQA
        >>> from ibeis.core_images import LocalizerConfig
        >>> import ibeis
        >>> ibs = ibeis.opendb('testdb1')
        >>> gid_list = ibs.get_valid_gids()
        >>> config = {'verbose': True}
        >>> downsample = False
        >>> results_list = detect_gid_list(ibs, gid_list, downsample, **config)
        >>> results_list = list(results_list)
        >>> print('result lens = %r' % (map(len, list(results_list))))
        >>> print('result[0] = %r' % (len(list(results_list[0][2]))))
        >>> config = {'verbose': True}
        >>> downsample = False
        >>> results_list = detect_gid_list(ibs, gid_list, downsample, **config)
        >>> results_list = list(results_list)
        >>> print('result lens = %r' % (map(len, list(results_list))))
        >>> print('result[0] = %r' % (len(list(results_list[0][2]))))
        >>> ut.quit_if_noshow()
        >>> import plottool as pt
        >>> ut.show_if_requested()

    Yields:
        results (list of dict)
    """
    # Get new gpaths if downsampling
    if downsample:
        gpath_list = ibs.get_image_detectpaths(gid_list)
        neww_list = [vt.open_image_size(gpath)[0] for gpath in gpath_list]
        oldw_list = [oldw for (oldw, oldh) in ibs.get_image_sizes(gid_list)]
        downsample_list = [oldw / neww for oldw, neww in zip(oldw_list, neww_list)]
        orient_list = [1] * len(gid_list)
    else:
        gpath_list = ibs.get_image_paths(gid_list)
        downsample_list = [None] * len(gpath_list)
        orient_list = ibs.get_image_orientation(gid_list)
    # Run detection
    results_iter = detect(gpath_list, verbose=verbose, **kwargs)
    # Upscale the results
    _iter = zip(downsample_list, gid_list, orient_list, results_iter)
    for downsample, gid, orient, (gpath, result_list) in _iter:
        # Upscale the results back up to the original image size
        for result in result_list:
            if downsample is not None and downsample != 1.0:
                for key in ['xtl', 'ytl', 'width', 'height']:
                    result[key] = int(result[key] * downsample)
            bbox = (result['xtl'], result['ytl'], result['width'], result['height'], )
            bbox_list = [ bbox ]
            bbox = bbox_list[0]
            result['xtl'], result['ytl'], result['width'], result['height'] = bbox
        yield (gid, gpath, result_list)
Ejemplo n.º 12
0
def detect_gid_list(ibs, gid_list, downsample=False, **kwargs):
    """
    Args:
        gid_list (list of int): the list of IBEIS image_rowids that need detection
        downsample (bool, optional): a flag to indicate if the original image
            sizes should be used; defaults to True

            True:  ibs.get_image_detectpaths() is used
            False: ibs.get_image_paths() is used

    Kwargs (optional): refer to the PyDarknet documentation for configuration settings

    Args:
        ibs (ibeis.IBEISController):  image analysis api
        gid_list (list of int): the list of IBEIS image_rowids that need detection
        downsample (bool, optional): a flag to indicate if the original image
                sizes should be used; defaults to True

    Kwargs:
        detector, config_filepath, weights_filepath, verbose

    Yields:
        tuple: (gid, gpath, result_list)

    CommandLine:
        python -m ibeis.algo.detect.yolo detect_gid_list --show

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.algo.detect.yolo import *  # NOQA
        >>> from ibeis.core_images import LocalizerConfig
        >>> import ibeis
        >>> ibs = ibeis.opendb(defaultdb='WS_ALL')
        >>> gid_list = ibs.images()._rowids[0:1]
        >>> kwargs = config = LocalizerConfig(**{
        >>>     'weights_filepath': '/media/raid/work/WS_ALL/localizer_backup/detect.yolo.2.39000.weights',
        >>>     'config_filepath': '/media/raid/work/WS_ALL/localizer_backup/detect.yolo.2.cfg',
        >>> })
        >>> exec(ut.execstr_dict(config), globals())
        >>> #classes_fpath = '/media/raid/work/WS_ALL/localizer_backup/detect.yolo.2.cfg.classes'
        >>> downsample = False
        >>> (gid, gpath, result_list) = detect_gid_list(ibs, gid_list, downsample, **config)
        >>> result = ('(gid, gpath, result_list) = %s' % (ut.repr2((gid, gpath, result_list)),))
        >>> print(result)
        >>> ut.quit_if_noshow()
        >>> import plottool as pt
        >>> ut.show_if_requested()

    Yields:
        results (list of dict)
    """
    # Get new gpaths if downsampling
    if downsample:
        gpath_list = ibs.get_image_detectpaths(gid_list)
        neww_list = [vt.open_image_size(gpath)[0] for gpath in gpath_list]
        oldw_list = [oldw for (oldw, oldh) in ibs.get_image_sizes(gid_list)]
        downsample_list = [
            oldw / neww for oldw, neww in zip(oldw_list, neww_list)
        ]
        orient_list = [1] * len(gid_list)
    else:
        gpath_list = ibs.get_image_paths(gid_list)
        downsample_list = [None] * len(gpath_list)
        orient_list = ibs.get_image_orientation(gid_list)
    # Run detection
    results_iter = detect(gpath_list, **kwargs)
    # Upscale the results
    _iter = zip(downsample_list, gid_list, orient_list, results_iter)
    for downsample, gid, orient, (gpath, result_list) in _iter:
        # Upscale the results back up to the original image size
        for result in result_list:
            if downsample is not None and downsample != 1.0:
                for key in ['xtl', 'ytl', 'width', 'height']:
                    result[key] = int(result[key] * downsample)
            bbox = (
                result['xtl'],
                result['ytl'],
                result['width'],
                result['height'],
            )
            bbox_list = [bbox]
            bbox = bbox_list[0]
            result['xtl'], result['ytl'], result['width'], result[
                'height'] = bbox
        yield (gid, gpath, result_list)
Ejemplo n.º 13
0
def download_sharks(XMLdata, number):
    """
    cd ~/work/WS_ALL
    python -m ibeis.scripts.getshark

    >>> from ibeis.scripts.getshark import *  # NOQA
    >>> url = 'www.whaleshark.org/listImages.jsp'
    >>> XMLdata = ut.url_read(url)
    >>> number = None
    """
    # Prepare the output directory for writing, if it doesn't exist
    output_dir = 'sharkimages'
    ut.ensuredir(output_dir)

    dom = parseString(XMLdata)

    # Download files
    if number:
        maxCount = min(number, len(dom.getElementsByTagName('img')))
    else:
        maxCount = len(dom.getElementsByTagName('img'))

    parsed_info = dict(
        img_url_list=[],
        localid_list=[],
        nameid_list=[],
        orig_fname_list=[],
        new_fname_list=[],
    )

    print('Preparing to fetch %i files...' % maxCount)

    for shark in dom.getElementsByTagName('shark'):
        localCount = 0
        for imageset in shark.getElementsByTagName('imageset'):
            for img in imageset.getElementsByTagName('img'):
                localCount += 1

                img_url = img.getAttribute('href')
                orig_fname = split(img_url)[1]
                ext = splitext(orig_fname)[1].lower()
                nameid = shark.getAttribute('number')

                new_fname = '%s-%i%s' % (nameid, localCount, ext)

                parsed_info['img_url_list'].append(img_url)
                parsed_info['nameid_list'].append(nameid)
                parsed_info['localid_list'].append(localCount)
                parsed_info['orig_fname_list'].append(orig_fname)
                parsed_info['new_fname_list'].append(new_fname)

                print('Parsed %i / %i files.' %
                      (len(parsed_info['orig_fname_list']), maxCount))

                if number is not None and len(
                        parsed_info['orig_fname_list']) == number:
                    break
    parsed_info['new_fpath_list'] = [
        join(output_dir, _fname) for _fname in parsed_info['new_fname_list']
    ]

    print('Filtering parsed images')

    # Filter based on image type (keep only jpgs)
    ext_flags = [
        _fname.endswith('.jpg') or _fname.endswith('.jpg')
        for _fname in parsed_info['new_fname_list']
    ]
    parsed_info = {
        key: ut.compress(list_, ext_flags)
        for key, list_ in parsed_info.items()
    }

    # Filter to only images matching the appropriate tags
    from ibeis import tag_funcs
    parsed_info['tags_list'] = parse_shark_tags(parsed_info['orig_fname_list'])
    tag_flags = tag_funcs.filterflags_general_tags(
        parsed_info['tags_list'],
        has_any=['view-left'],
        none_match=['qual.*', 'view-top', 'part-.*', 'cropped'],
    )
    parsed_info = {
        key: ut.compress(list_, tag_flags)
        for key, list_ in parsed_info.items()
    }
    print('Tags in chosen images:')
    print(ut.dict_hist(ut.flatten(parsed_info['tags_list'])))

    # Download selected subset
    print('Downloading selected subset')
    _iter = list(
        zip(parsed_info['img_url_list'], parsed_info['new_fpath_list']))
    _iter = ut.ProgressIter(_iter, lbl='downloading sharks')
    for img_url, new_fpath in _iter:
        if not exists(new_fpath):
            ut.download_url(img_url, new_fpath)

    # Remove corrupted or ill-formatted images
    print('Checking for corrupted images')
    import vtool as vt
    noncorrupt_flags = vt.filterflags_valid_images(
        parsed_info['new_fpath_list'])
    parsed_info = {
        key: ut.compress(list_, noncorrupt_flags)
        for key, list_ in parsed_info.items()
    }

    print('Removing small images')
    import numpy as np
    imgsize_list = np.array(
        [vt.open_image_size(gpath) for gpath in parsed_info['new_fpath_list']])
    sqrt_area_list = np.sqrt(np.prod(imgsize_list, axis=1))
    areq_flags_list = sqrt_area_list >= 750
    parsed_info = {
        key: ut.compress(list_, areq_flags_list)
        for key, list_ in parsed_info.items()
    }

    grouped_idxs = ut.group_items(list(range(len(parsed_info['nameid_list']))),
                                  parsed_info['nameid_list'])
    keep_idxs = sorted(
        ut.flatten(
            [idxs for key, idxs in grouped_idxs.items() if len(idxs) >= 2]))
    parsed_info = {
        key: ut.take(list_, keep_idxs)
        for key, list_ in parsed_info.items()
    }

    print('Moving imagse to secondary directory')
    named_outputdir = 'named-left-sharkimages'
    # Build names
    parsed_info['namedir_fpath_list'] = [
        join(named_outputdir, _nameid, _fname) for _fname, _nameid in zip(
            parsed_info['new_fname_list'], parsed_info['nameid_list'])
    ]
    # Create directories
    ut.ensuredir(named_outputdir)
    named_dirs = ut.unique_ordered(
        list(map(dirname, parsed_info['namedir_fpath_list'])))
    for dir_ in named_dirs:
        ut.ensuredir(dir_)
    # Copy
    ut.copy_files_to(src_fpath_list=parsed_info['new_fpath_list'],
                     dst_fpath_list=parsed_info['namedir_fpath_list'])