Example #1
def run(args):
    options = configure_arg_parser(args)

    c = Crawler(options)
    Post.to_csv(c.posts)

    file_names = [image for p in c.posts for image in p.images]
    downloader.download_images(file_names, partial(progress_reporter, total_items=len(file_names)))
Example #2
def run(args):
    options = configure_arg_parser(args)

    c = Crawler(options)
    Post.to_csv(c.posts)

    file_names = [image for p in c.posts for image in p.images]
    downloader.download_images(
        file_names, partial(progress_reporter, total_items=len(file_names)))
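Examples #1 and #2 bind total_items into the progress callback with functools.partial. A minimal sketch of a compatible reporter, assuming downloader.download_images invokes the callback with the count of completed items (that contract is not shown above):

from functools import partial

def progress_reporter(completed, total_items):
    # Overwrite a single line with "completed/total" progress.
    print('\rDownloaded {}/{} images'.format(completed, total_items),
          end='', flush=True)

# downloader.download_images(file_names,
#                            partial(progress_reporter, total_items=len(file_names)))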
Example #3
def download():
    for species in all_species:

        print('Getting urls for', species)
        urls = get_urls(species, images_per_species)

        print('Downloading images for', species)
        path = os.path.join('data', species)

        download_images(urls, path)
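Examples #3 through #5 depend on get_urls and download_images helpers that are not shown. A minimal sketch of the download half, assuming plain HTTP(S) image URLs and the requests library (the project's actual helper may differ):

import os
import requests

def download_images(urls, path):
    # Create the target directory if needed, then fetch each URL in turn.
    os.makedirs(path, exist_ok=True)
    for i, url in enumerate(urls):
        try:
            resp = requests.get(url, timeout=20)
            resp.raise_for_status()
        except requests.RequestException as exc:
            print('Skipping', url, '->', exc)
            continue
        # Index-based filenames; the real helper may derive names from the URL.
        with open(os.path.join(path, '{}.jpg'.format(i)), 'wb') as f:
            f.write(resp.content)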
Example #4
def download_test():
    print("Test Set")
    for setting in all_types:

        print('Getting urls for', setting)
        urls = get_urls(setting, test_images)

        print('Downloading images for', setting)
        path = os.path.join('data', 'test', setting)

        download_images(urls, path)
Example #5
def download_train():
    print("Training Set")
    for setting in all_types:

        print('Getting urls for', setting)
        urls = get_urls(setting, train_images)

        print('Downloading images for', setting)
        path = os.path.join('data', 'train', setting)

        download_images(urls, path)
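download_test and download_train differ only in the split name and the image count, so they could share one helper. A sketch of that consolidation (the names download_split and num_images are placeholders, not from the original):

def download_split(split, num_images):
    print(split.capitalize(), 'Set')
    for setting in all_types:
        print('Getting urls for', setting)
        urls = get_urls(setting, num_images)
        print('Downloading images for', setting)
        path = os.path.join('data', split, setting)
        download_images(urls, path)

# download_split('test', test_images)
# download_split('train', train_images)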
Example #6
def download(outputPath):
    for specie in all_species:

        print('Getting urls for', specie)
        urls = get_urls(specie, img_per_specie, sizes, api_param)
        print('Downloading images for', specie)
        if outputPath == 'data/':
            path = os.path.join('data', specie, 'raw')
            download_images(urls, path)
        else:
            path = outputPath
            path = path + specie + '/raw'
            download_images(urls, path)
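In Example #6 both branches build outputPath/specie/raw; since os.path.join normalizes separators whether or not outputPath ends in a slash, the conditional could collapse to a single path expression. A sketch of that simplification:

def download(outputPath='data/'):
    for specie in all_species:
        print('Getting urls for', specie)
        urls = get_urls(specie, img_per_specie, sizes, api_param)
        print('Downloading images for', specie)
        # os.path.join handles trailing slashes, so no special-casing is needed.
        path = os.path.join(outputPath, specie, 'raw')
        download_images(urls, path)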
Example #7
def main(argv):
    parser = argparse.ArgumentParser(description="Image Downloader")
    parser.add_argument("keywords", type=str,
                        help='Keywords to search. ("in quotes")')
    parser.add_argument("--engine", "-e", type=str, default="Google",
                        help="Image search engine.", choices=["Google", "Bing", "Baidu"])
    parser.add_argument("--driver", "-d", type=str, default="chrome_headless",
                        help="Image search engine.", choices=["chrome_headless", "chrome", "phantomjs"])
    parser.add_argument("--max-number", "-n", type=int, default=100,
                        help="Max number of images download for the keywords.")
    parser.add_argument("--num-threads", "-j", type=int, default=50,
                        help="Number of threads to concurrently download images.")
    parser.add_argument("--timeout", "-t", type=int, default=20,
                        help="Seconds to timeout when download an image.")
    parser.add_argument("--output", "-o", type=str, default="./download_images",
                        help="Output directory to save downloaded images.")
    parser.add_argument("--safe-mode", "-S", action="store_true", default=False,
                        help="Turn on safe search mode. (Only effective in Google)")
    parser.add_argument("--face-only", "-F", action="store_true", default=False,
                        help="Only search for ")
    parser.add_argument("--proxy_http", "-ph", type=str, default=None,
                        help="Set http proxy (e.g. 192.168.0.2:8080)")
    parser.add_argument("--proxy_socks5", "-ps", type=str, default=None,
                        help="Set socks5 proxy (e.g. 192.168.0.2:1080)")

    args = parser.parse_args(args=argv)

    proxy_type = None
    proxy = None
    if args.proxy_http is not None:
        proxy_type = "http"
        proxy = args.proxy_http
    elif args.proxy_socks5 is not None:
        proxy_type = "socks5"
        proxy = args.proxy_socks5

    crawled_urls = crawler.crawl_image_urls(args.keywords,
                                            engine=args.engine, max_number=args.max_number,
                                            face_only=args.face_only, safe_mode=args.safe_mode,
                                            proxy_type=proxy_type, proxy=proxy,
                                            browser=args.driver)
    downloader.download_images(image_urls=crawled_urls, dst_dir=args.output,
                               concurrency=args.num_threads, timeout=args.timeout,
                               proxy_type=proxy_type, proxy=proxy,
                               file_prefix=args.engine)

    print("Finished.")
Example #8
def main(argv):
    parser = argparse.ArgumentParser(description="Image Downloader")
    #parser.add_argument("keywords", type=str,
    #                    help='Keywords to search. ("in quotes")')
    parser.add_argument("--engine", "-e", type=str, default="Google",
                        help="Image search engine.", choices=["Google", "Bing", "Baidu"])
    parser.add_argument("--max-number", "-n", type=int, default=50000,
                        help="Max number of images download for the keywords.")
    parser.add_argument("--num-threads", "-j", type=int, default=16,
                        help="Number of threads to concurrently download images.")
    parser.add_argument("--timeout", "-t", type=int, default=20,
                        help="Seconds to timeout when download an image.")
    parser.add_argument("--output", "-o", type=str, default="./download_images",
                        help="Output directory to save downloaded images.")
    parser.add_argument("--safe-mode", "-S", action="store_true", default=True,
                        help="Turn on safe search mode. (Only effective in Google)")
    parser.add_argument("--face-only", "-F", action="store_true", default=False,
                        help="Only search for ")
    parser.add_argument("--proxy_http", "-ph", type=str, default='web-proxy.tencent.com:8080',
                        help="Set http proxy (e.g. 192.168.0.2:8080)")
    parser.add_argument("--proxy_socks5", "-ps", type=str, default=None,
                        help="Set socks5 proxy (e.g. 192.168.0.2:1080)")

    args = parser.parse_args(args=argv)

    proxy_type = None
    proxy = None
    if args.proxy_http is not None:
        proxy_type = "http"
        proxy = args.proxy_http
    elif args.proxy_socks5 is not None:
        proxy_type = "socks5"
        proxy = args.proxy_socks5
    # Keywords translate to: military parade, National Day grand parade,
    # Tiananmen parade, troops, US parade, Russia parade, army, UK parade,
    # Germany parade, France parade, Japan parade, China parade, Iraq parade.
    for _keywords in ['阅兵', '国庆大阅兵', '天安门阅兵', '队伍', '美国阅兵', '俄罗斯阅兵', '军队', '英国阅兵', '德国阅兵', '法国阅兵', '日本阅兵', '中国阅兵', '伊拉克阅兵']:

        for _engine in ["Google", "Bing", "Baidu"]:
            crawled_urls = crawler.crawl_image_urls(_keywords,
                                                    engine=_engine, max_number=args.max_number,
                                                    face_only=args.face_only, safe_mode=args.safe_mode,
                                                    proxy_type=proxy_type, proxy=proxy)
            downloader.download_images(image_urls=crawled_urls, dst_dir=args.output,
                                       concurrency=args.num_threads, timeout=args.timeout,
                                       proxy_type=proxy_type, proxy=proxy,
                                       file_prefix=_engine + '_' + hashlib.md5(_keywords.encode()).hexdigest())

    print("Finished.")
Example #9
def main(argv):
    parser = argparse.ArgumentParser(description="Image Downloader")
    parser.add_argument("keywords", type=str,
                        help='Keywords to search. ("in quotes")')
    parser.add_argument("--engine", "-e", type=str, default="Google",
                        help="Image search engine.", choices=["Google", "Bing", "Baidu"])
    parser.add_argument("--max-number", "-n", type=int, default=100,
                        help="Max number of images download for the keywords.")
    parser.add_argument("--num-threads", "-j", type=int, default=50,
                        help="Number of threads to concurrently download images.")
    parser.add_argument("--timeout", "-t", type=int, default=20,
                        help="Seconds to timeout when download an image.")
    parser.add_argument("--output", "-o", type=str, default="./download_images",
                        help="Output directory to save downloaded images.")
    parser.add_argument("--safe-mode", "-S", action="store_true", default=False,
                        help="Turn on safe search mode. (Only effective in Google)")
    parser.add_argument("--face-only", "-F", action="store_true", default=False,
                        help="Only search for ")
    parser.add_argument("--proxy_http", "-ph", type=str, default=None,
                        help="Set http proxy (e.g. 192.168.0.2:8080)")
    parser.add_argument("--proxy_socks5", "-ps", type=str, default=None,
                        help="Set socks5 proxy (e.g. 192.168.0.2:1080)")

    args = parser.parse_args(args=argv)

    proxy_type = None
    proxy = None
    if args.proxy_http is not None:
        proxy_type = "http"
        proxy = args.proxy_http
    elif args.proxy_socks5 is not None:
        proxy_type = "socks5"
        proxy = args.proxy_socks5

    crawled_urls = crawler.crawl_image_urls(args.keywords,
                                            engine=args.engine, max_number=args.max_number,
                                            face_only=args.face_only, safe_mode=args.safe_mode,
                                            proxy_type=proxy_type, proxy=proxy,
                                            browser="phantomjs")
    downloader.download_images(image_urls=crawled_urls, dst_dir=args.output,
                               concurrency=args.num_threads, timeout=args.timeout,
                               proxy_type=proxy_type, proxy=proxy,
                               file_prefix=args.engine)

    print("Finished.")
Example #10
def run_keyword(keyword, args, proxy, proxy_type):
    crawled_urls = crawler.crawl_image_urls(keyword,
                                            engine=args.engine,
                                            max_number=args.max_number,
                                            face_only=args.face_only,
                                            safe_mode=args.safe_mode,
                                            proxy_type=proxy_type,
                                            proxy=proxy,
                                            browser=args.driver)

    dst_dir = os.path.join(args.output, keyword)
    downloader.download_images(image_urls=crawled_urls,
                               dst_dir=dst_dir,
                               concurrency=args.num_threads,
                               timeout=args.timeout,
                               proxy_type=proxy_type,
                               proxy=proxy,
                               file_prefix=args.engine)
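run_keyword factors the per-keyword crawl-and-download step into one call. A sketch of a driver loop over several keywords, assuming a comma-separated keywords argument (that convention is an assumption, not shown above):

def run_all(args, proxy, proxy_type):
    # Assumed convention: comma-separated keywords, e.g. "cats,dogs".
    for keyword in args.keywords.split(','):
        keyword = keyword.strip()
        if keyword:
            run_keyword(keyword, args, proxy, proxy_type)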
Example #11
import scraper
import downloader
import global_variables as gv


keywords_list = [
    "face with eyes closed",
    "frontal face",
]

dst_dir = "/images/eye/"

for keywords in keywords_list:
    img_dir = gv.root_dir + dst_dir + keywords

    scraped_urls = scraper.scrape_image_urls(keywords, 100, face_only=True, safe_mode=True,
                                             proxy="192.168.0.92:1080", proxy_type="socks5")

    downloader.download_images(scraped_urls, img_dir, concurrency=50)
Example #12
def google_download(argv):

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    parser = argparse.ArgumentParser(description="Image Downloader")
    parser.add_argument("keywords",
                        type=str,
                        help='Keywords to search. ("in quotes")')
    parser.add_argument("--engine",
                        "-e",
                        type=str,
                        default="Google",
                        help="Image search engine.",
                        choices=["Google", "Bing", "Baidu"])
    parser.add_argument("--driver",
                        "-d",
                        type=str,
                        default="chrome_headless",
                        help="Image search engine.",
                        choices=["chrome_headless", "chrome", "phantomjs"])
    parser.add_argument("--max-number",
                        "-n",
                        type=int,
                        default=100,
                        help="Max number of images download for the keywords.")
    parser.add_argument(
        "--num-threads",
        "-j",
        type=int,
        default=50,
        help="Number of threads to concurrently download images.")
    parser.add_argument("--timeout",
                        "-t",
                        type=int,
                        default=20,
                        help="Seconds to timeout when download an image.")
    parser.add_argument("--output",
                        "-o",
                        type=str,
                        default="./download_images",
                        help="Output directory to save downloaded images.")
    parser.add_argument(
        "--safe-mode",
        "-S",
        action="store_true",
        default=False,
        help="Turn on safe search mode. (Only effective in Google)")
    parser.add_argument("--label-age",
                        "-l",
                        action="store_true",
                        default=True,
                        help="extract the age ")
    parser.add_argument("--birthdate",
                        "-B",
                        type=str,
                        default=None,
                        help="birthdate of the searched person")
    parser.add_argument("--face-only",
                        "-F",
                        action="store_true",
                        default=False,
                        help="Only search for ")
    parser.add_argument("--proxy_http",
                        "-ph",
                        type=str,
                        default=None,
                        help="Set http proxy (e.g. 192.168.0.2:8080)")
    parser.add_argument("--proxy_socks5",
                        "-ps",
                        type=str,
                        default=None,
                        help="Set socks5 proxy (e.g. 192.168.0.2:1080)")

    args = parser.parse_args(args=argv)
    # argv = ['-e', 'Google', '-d', 'chrome_headless', '-n', '40', '-j', '10', '-o', 'img/google/kids10/Colin_Baiocchi', '-F', '-S', 'Colin Baiocchi']

    proxy_type = None
    proxy = None
    if args.proxy_http is not None:
        proxy_type = "http"
        proxy = args.proxy_http
    elif args.proxy_socks5 is not None:
        proxy_type = "socks5"
        proxy = args.proxy_socks5

    if args.label_age and args.birthdate is None:
        raise RuntimeError("Birthdate is necessary if args.label_age is True")

    sleep_time = 2
    num_retries = 4

    for x in range(num_retries):
        """
        selenium.common.exceptions.WebDriverException: Message: unknown error: Chrome failed to start: exited abnormally.
        (unknown error: DevToolsActivePort file doesn't exist)
        (The process started from chrome location /usr/bin/chromium is no longer running, so ChromeDriver is assuming that Chrome has crashed.)

        """
        try:
            crawled_urls = crawler.crawl_image_urls(args.keywords,
                                                    engine=args.engine,
                                                    max_number=args.max_number,
                                                    face_only=args.face_only,
                                                    safe_mode=args.safe_mode,
                                                    proxy_type=proxy_type,
                                                    proxy=proxy,
                                                    browser=args.driver)
            downloader.download_images(image_urls=crawled_urls,
                                       dst_dir=args.output,
                                       concurrency=args.num_threads,
                                       timeout=args.timeout,
                                       proxy_type=proxy_type,
                                       proxy=proxy,
                                       file_prefix=args.keywords)
        except WebDriverException:
            sleep(sleep_time)
        else:
            break

    ageLabeler = ExifImageAgeLabeler()
    # dir = "Image-Downloader/download_images/google/kids10"
    files = os.listdir(args.output)
    # files = [file for file in files if os.path.isfile(file)]
    for fn in files:
        age, _ = ageLabeler.label_age(fn,
                                      birthdate_str=args.birthdate,
                                      image_dir=args.output)
        if age is not None:
            src = os.path.join(args.output, fn)
            imagename_with_age = os.path.splitext(fn)[0] + "|{}".format(
                age) + os.path.splitext(fn)[1]
            dst = os.path.join(args.output, imagename_with_age)
            os.rename(src, dst)
        else:
            src = os.path.join(args.output, fn)
            os.remove(src)

    logger.info("Finished.")

    return len(os.listdir(args.output))
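The commented-out argv inside google_download hints at how it is called; a usage sketch based on that comment (the birthdate value here is hypothetical, added because --label-age defaults to True and requires --birthdate):

num_images = google_download([
    '-e', 'Google', '-d', 'chrome_headless', '-n', '40', '-j', '10',
    '-o', 'img/google/kids10/Colin_Baiocchi', '-F', '-S',
    '-B', '2007-01-01',  # hypothetical birthdate for age labeling
    'Colin Baiocchi',
])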
Example #13
def main(argv):
    parser = argparse.ArgumentParser(description="Image Downloader")
    parser.add_argument("keywords",
                        type=str,
                        help='Keywords to search. ("in quotes")')
    parser.add_argument("--engine",
                        "-e",
                        type=str,
                        default="Google",
                        help="Image search engine.",
                        choices=["Google", "Bing", "Baidu"])
    parser.add_argument("--driver",
                        "-d",
                        type=str,
                        default="chrome_headless",
                        help="Image search engine.",
                        choices=["chrome_headless", "chrome", "phantomjs"])
    parser.add_argument("--max-number",
                        "-n",
                        type=int,
                        default=100,
                        help="Max number of images download for the keywords.")
    parser.add_argument(
        "--num-threads",
        "-j",
        type=int,
        default=50,
        help="Number of threads to concurrently download images.")
    parser.add_argument("--timeout",
                        "-t",
                        type=int,
                        default=20,
                        help="Seconds to timeout when download an image.")
    parser.add_argument("--output",
                        "-o",
                        type=str,
                        default="./download_images",
                        help="Output directory to save downloaded images.")
    parser.add_argument(
        "--safe-mode",
        "-S",
        action="store_true",
        default=False,
        help="Turn on safe search mode. (Only effective in Google)")
    parser.add_argument("--face-only",
                        "-F",
                        action="store_true",
                        default=False,
                        help="Only search for ")
    parser.add_argument("--proxy_http",
                        "-ph",
                        type=str,
                        default=None,
                        help="Set http proxy (e.g. 192.168.0.2:8080)")
    parser.add_argument("--proxy_socks5",
                        "-ps",
                        type=str,
                        default=None,
                        help="Set socks5 proxy (e.g. 192.168.0.2:1080)")
    # type is not supported for Baidu
    parser.add_argument("--type",
                        "-ty",
                        type=str,
                        default=None,
                        help="What kinds of images to download.",
                        choices=["clipart", "linedrawing", "photograph"])
    # Bing: "color" for colored images, "bw" for black & white; named colors include
    # Red, Orange, Yellow, Green, Teal, Blue, Purple, Pink, Brown, Black, Gray, White.
    # Baidu: white, bw, black, pink, blue, red, yellow, purple, green, teal, orange, brown.
    # Google: bw, red, orange, yellow, green, teal, blue, purple, pink, white, gray, black, brown.
    parser.add_argument("--color",
                        "-cl",
                        type=str,
                        default=None,
                        help="Specify the color of desired images.")

    args = parser.parse_args(args=argv)

    proxy_type = None
    proxy = None
    if args.proxy_http is not None:
        proxy_type = "http"
        proxy = args.proxy_http
    elif args.proxy_socks5 is not None:
        proxy_type = "socks5"
        proxy = args.proxy_socks5

    crawled_urls = crawler.crawl_image_urls(args.keywords,
                                            engine=args.engine,
                                            max_number=args.max_number,
                                            face_only=args.face_only,
                                            safe_mode=args.safe_mode,
                                            proxy_type=proxy_type,
                                            proxy=proxy,
                                            browser=args.driver,
                                            image_type=args.type,
                                            color=args.color)
    downloader.download_images(image_urls=crawled_urls,
                               dst_dir=args.output,
                               concurrency=args.num_threads,
                               timeout=args.timeout,
                               proxy_type=proxy_type,
                               proxy=proxy,
                               file_prefix=args.engine)

    print("Finished.")
Example #14
def download_images_by_hashtag(tag, number, debug):
    ins_crawler = InsCrawler(has_screen=debug)
    posts = ins_crawler.get_latest_posts_by_tag(tag, number)
    download_images(posts, tag)
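A usage sketch for Example #14 with illustrative values (the tag and count are placeholders); debug=True would run the crawler with a visible browser window (has_screen):

# Download images from the 50 latest posts tagged #sunset.
download_images_by_hashtag('sunset', 50, debug=False)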