import argparse

import crawler
import downloader


def main(argv):
    parser = argparse.ArgumentParser(description="Image Downloader")
    parser.add_argument("keywords", type=str,
                        help='Keywords to search. ("in quotes")')
    parser.add_argument("--engine", "-e", type=str, default="Google",
                        help="Image search engine.",
                        choices=["Google", "Bing", "Baidu"])
    parser.add_argument("--driver", "-d", type=str, default="chrome_headless",
                        help="Browser driver used to crawl search pages.",
                        choices=["chrome_headless", "chrome", "phantomjs"])
    parser.add_argument("--max-number", "-n", type=int, default=100,
                        help="Max number of images to download for the keywords.")
    parser.add_argument("--num-threads", "-j", type=int, default=50,
                        help="Number of threads to concurrently download images.")
    parser.add_argument("--timeout", "-t", type=int, default=20,
                        help="Seconds to wait before timing out an image download.")
    parser.add_argument("--output", "-o", type=str, default="./download_images",
                        help="Output directory to save downloaded images.")
    parser.add_argument("--safe-mode", "-S", action="store_true", default=False,
                        help="Turn on safe search mode. (Only effective in Google)")
    parser.add_argument("--face-only", "-F", action="store_true", default=False,
                        help="Only search for images containing faces.")
    parser.add_argument("--proxy_http", "-ph", type=str, default=None,
                        help="Set http proxy (e.g. 192.168.0.2:8080)")
    parser.add_argument("--proxy_socks5", "-ps", type=str, default=None,
                        help="Set socks5 proxy (e.g. 192.168.0.2:1080)")
    args = parser.parse_args(args=argv)

    proxy_type = None
    proxy = None
    if args.proxy_http is not None:
        proxy_type = "http"
        proxy = args.proxy_http
    elif args.proxy_socks5 is not None:
        proxy_type = "socks5"
        proxy = args.proxy_socks5

    crawled_urls = crawler.crawl_image_urls(args.keywords,
                                            engine=args.engine,
                                            max_number=args.max_number,
                                            face_only=args.face_only,
                                            safe_mode=args.safe_mode,
                                            proxy_type=proxy_type,
                                            proxy=proxy,
                                            browser=args.driver)
    downloader.download_images(image_urls=crawled_urls,
                               dst_dir=args.output,
                               concurrency=args.num_threads,
                               timeout=args.timeout,
                               proxy_type=proxy_type,
                               proxy=proxy,
                               file_prefix=args.engine)

    print("Finished.")

import argparse
import hashlib

import crawler
import downloader


def main(argv):
    parser = argparse.ArgumentParser(description="Image Downloader")
    # parser.add_argument("keywords", type=str,
    #                     help='Keywords to search. ("in quotes")')
    parser.add_argument("--engine", "-e", type=str, default="Google",
                        help="Image search engine.",
                        choices=["Google", "Bing", "Baidu"])
    parser.add_argument("--max-number", "-n", type=int, default=50000,
                        help="Max number of images to download for the keywords.")
    parser.add_argument("--num-threads", "-j", type=int, default=16,
                        help="Number of threads to concurrently download images.")
    parser.add_argument("--timeout", "-t", type=int, default=20,
                        help="Seconds to wait before timing out an image download.")
    parser.add_argument("--output", "-o", type=str, default="./download_images",
                        help="Output directory to save downloaded images.")
    # Note: with action="store_true" and default=True, safe mode is always on
    # and cannot be disabled from the command line.
    parser.add_argument("--safe-mode", "-S", action="store_true", default=True,
                        help="Turn on safe search mode. (Only effective in Google)")
    parser.add_argument("--face-only", "-F", action="store_true", default=False,
                        help="Only search for images containing faces.")
    parser.add_argument("--proxy_http", "-ph", type=str,
                        default='web-proxy.tencent.com:8080',
                        help="Set http proxy (e.g. 192.168.0.2:8080)")
    parser.add_argument("--proxy_socks5", "-ps", type=str, default=None,
                        help="Set socks5 proxy (e.g. 192.168.0.2:1080)")
    args = parser.parse_args(args=argv)

    proxy_type = None
    proxy = None
    if args.proxy_http is not None:
        proxy_type = "http"
        proxy = args.proxy_http
    elif args.proxy_socks5 is not None:
        proxy_type = "socks5"
        proxy = args.proxy_socks5

    # Hard-coded keywords (Chinese): "military parade", "National Day grand
    # parade", "Tiananmen parade", "troops", "US parade", "Russia parade",
    # "army", "UK parade", "Germany parade", "France parade", "Japan parade",
    # "China parade", "Iraq parade".
    for _keywords in ['阅兵', '国庆大阅兵', '天安门阅兵', '队伍', '美国阅兵',
                      '俄罗斯阅兵', '军队', '英国阅兵', '德国阅兵', '法国阅兵',
                      '日本阅兵', '中国阅兵', '伊拉克阅兵']:
        for _engine in ["Google", "Bing", "Baidu"]:
            crawled_urls = crawler.crawl_image_urls(_keywords,
                                                    engine=_engine,
                                                    max_number=args.max_number,
                                                    face_only=args.face_only,
                                                    safe_mode=args.safe_mode,
                                                    proxy_type=proxy_type,
                                                    proxy=proxy)
            # Prefix files with the engine name and an MD5 of the keyword so
            # results from different queries do not collide.
            downloader.download_images(
                image_urls=crawled_urls,
                dst_dir=args.output,
                concurrency=args.num_threads,
                timeout=args.timeout,
                proxy_type=proxy_type,
                proxy=proxy,
                file_prefix=_engine + '_' + hashlib.md5(_keywords.encode()).hexdigest())

    print("Finished.")

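# Usage note (illustrative, not from the original source): this variant takes
# no positional keywords, so an empty argv runs the full hard-coded
# keyword-by-engine sweep through the default HTTP proxy:
#
#     main([])  # crawls every keyword with Google, Bing, and Baidu in turn
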
import argparse

import crawler
import downloader


def main(argv):
    parser = argparse.ArgumentParser(description="Image Downloader")
    parser.add_argument("keywords", type=str,
                        help='Keywords to search. ("in quotes")')
    parser.add_argument("--engine", "-e", type=str, default="Google",
                        help="Image search engine.",
                        choices=["Google", "Bing", "Baidu"])
    parser.add_argument("--max-number", "-n", type=int, default=100,
                        help="Max number of images to download for the keywords.")
    parser.add_argument("--num-threads", "-j", type=int, default=50,
                        help="Number of threads to concurrently download images.")
    parser.add_argument("--timeout", "-t", type=int, default=20,
                        help="Seconds to wait before timing out an image download.")
    parser.add_argument("--output", "-o", type=str, default="./download_images",
                        help="Output directory to save downloaded images.")
    parser.add_argument("--safe-mode", "-S", action="store_true", default=False,
                        help="Turn on safe search mode. (Only effective in Google)")
    parser.add_argument("--face-only", "-F", action="store_true", default=False,
                        help="Only search for images containing faces.")
    parser.add_argument("--proxy_http", "-ph", type=str, default=None,
                        help="Set http proxy (e.g. 192.168.0.2:8080)")
    parser.add_argument("--proxy_socks5", "-ps", type=str, default=None,
                        help="Set socks5 proxy (e.g. 192.168.0.2:1080)")
    args = parser.parse_args(args=argv)

    proxy_type = None
    proxy = None
    if args.proxy_http is not None:
        proxy_type = "http"
        proxy = args.proxy_http
    elif args.proxy_socks5 is not None:
        proxy_type = "socks5"
        proxy = args.proxy_socks5

    # This variant always crawls with the PhantomJS browser driver.
    crawled_urls = crawler.crawl_image_urls(args.keywords,
                                            engine=args.engine,
                                            max_number=args.max_number,
                                            face_only=args.face_only,
                                            safe_mode=args.safe_mode,
                                            proxy_type=proxy_type,
                                            proxy=proxy,
                                            browser="phantomjs")
    downloader.download_images(image_urls=crawled_urls,
                               dst_dir=args.output,
                               concurrency=args.num_threads,
                               timeout=args.timeout,
                               proxy_type=proxy_type,
                               proxy=proxy,
                               file_prefix=args.engine)

    print("Finished.")

import os

import crawler
import downloader


def run_keyword(keyword, args, proxy, proxy_type):
    """Crawl and download images for a single keyword into its own subdirectory."""
    crawled_urls = crawler.crawl_image_urls(keyword,
                                            engine=args.engine,
                                            max_number=args.max_number,
                                            face_only=args.face_only,
                                            safe_mode=args.safe_mode,
                                            proxy_type=proxy_type,
                                            proxy=proxy,
                                            browser=args.driver)
    dst_dir = os.path.join(args.output, keyword)
    downloader.download_images(image_urls=crawled_urls,
                               dst_dir=dst_dir,
                               concurrency=args.num_threads,
                               timeout=args.timeout,
                               proxy_type=proxy_type,
                               proxy=proxy,
                               file_prefix=args.engine)

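# Usage sketch (illustrative): run_keyword() expects the parsed argparse
# namespace produced by one of the main() variants above, so a batch run over
# a few hypothetical keywords looks like:
#
#     args = parser.parse_args(["placeholder"])  # parser built as in main()
#     for kw in ["cats", "dogs", "birds"]:
#         run_keyword(kw, args, proxy=None, proxy_type=None)
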
from typing import List

# crawl_image_urls and isValidURL are assumed to be provided by the
# surrounding module (the project's crawler/URL-validation helpers).


def fetch_image_urls(query: str,
                     limit: int = 20,
                     file_type: str = '',
                     filters: str = '',
                     extra_query_params: str = '') -> List[str]:
    result = list()
    keywords = query
    if len(file_type) > 0:
        keywords = query + " " + file_type
    urls = crawl_image_urls(keywords, filters, limit,
                            extra_query_params=extra_query_params)
    for url in urls:
        # Keep only unique URLs whose extension matches the requested type.
        if isValidURL(url, file_type) and url not in result:
            result.append(url)
        if len(result) >= limit:
            break
    return result

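# Usage sketch (illustrative): fetch at most 10 unique PNG URLs for a query.
# The file_type string is appended to the search keywords and then re-checked
# against each URL by isValidURL(), so only matching extensions survive:
#
#     urls = fetch_image_urls("golden retriever", limit=10, file_type="png")
#     for url in urls:
#         print(url)
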
import argparse
import logging
import os
from time import sleep

from selenium.common.exceptions import WebDriverException

import crawler
import downloader

# ExifImageAgeLabeler is assumed to be defined in the surrounding project.


def google_download(argv):
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    parser = argparse.ArgumentParser(description="Image Downloader")
    parser.add_argument("keywords", type=str,
                        help='Keywords to search. ("in quotes")')
    parser.add_argument("--engine", "-e", type=str, default="Google",
                        help="Image search engine.",
                        choices=["Google", "Bing", "Baidu"])
    parser.add_argument("--driver", "-d", type=str, default="chrome_headless",
                        help="Browser driver used to crawl search pages.",
                        choices=["chrome_headless", "chrome", "phantomjs"])
    parser.add_argument("--max-number", "-n", type=int, default=100,
                        help="Max number of images to download for the keywords.")
    parser.add_argument("--num-threads", "-j", type=int, default=50,
                        help="Number of threads to concurrently download images.")
    parser.add_argument("--timeout", "-t", type=int, default=20,
                        help="Seconds to wait before timing out an image download.")
    parser.add_argument("--output", "-o", type=str, default="./download_images",
                        help="Output directory to save downloaded images.")
    parser.add_argument("--safe-mode", "-S", action="store_true", default=False,
                        help="Turn on safe search mode. (Only effective in Google)")
    parser.add_argument("--label-age", "-l", action="store_true", default=True,
                        help="Label each image with the person's age (requires --birthdate).")
    parser.add_argument("--birthdate", "-B", type=str, default=None,
                        help="Birthdate of the searched person.")
    parser.add_argument("--face-only", "-F", action="store_true", default=False,
                        help="Only search for images containing faces.")
    parser.add_argument("--proxy_http", "-ph", type=str, default=None,
                        help="Set http proxy (e.g. 192.168.0.2:8080)")
    parser.add_argument("--proxy_socks5", "-ps", type=str, default=None,
                        help="Set socks5 proxy (e.g. 192.168.0.2:1080)")
    args = parser.parse_args(args=argv)
    # Example argv:
    # ['-e', 'Google', '-d', 'chrome_headless', '-n', '40', '-j', '10',
    #  '-o', 'img/google/kids10/Colin_Baiocchi', '-F', '-S', 'Colin Baiocchi']

    proxy_type = None
    proxy = None
    if args.proxy_http is not None:
        proxy_type = "http"
        proxy = args.proxy_http
    elif args.proxy_socks5 is not None:
        proxy_type = "socks5"
        proxy = args.proxy_socks5

    if args.label_age and args.birthdate is None:
        raise RuntimeError("Birthdate is necessary if args.label_age is True")

    # Chrome occasionally fails to start, e.g.:
    #   selenium.common.exceptions.WebDriverException: Message: unknown error:
    #   Chrome failed to start: exited abnormally.
    #   (unknown error: DevToolsActivePort file doesn't exist)
    #   (The process started from chrome location /usr/bin/chromium is no
    #   longer running, so ChromeDriver is assuming that Chrome has crashed.)
    # Retry a few times before giving up.
    sleep_time = 2
    num_retries = 4
    for _ in range(num_retries):
        try:
            crawled_urls = crawler.crawl_image_urls(args.keywords,
                                                    engine=args.engine,
                                                    max_number=args.max_number,
                                                    face_only=args.face_only,
                                                    safe_mode=args.safe_mode,
                                                    proxy_type=proxy_type,
                                                    proxy=proxy,
                                                    browser=args.driver)
            downloader.download_images(image_urls=crawled_urls,
                                       dst_dir=args.output,
                                       concurrency=args.num_threads,
                                       timeout=args.timeout,
                                       proxy_type=proxy_type,
                                       proxy=proxy,
                                       file_prefix=args.keywords)
        except WebDriverException:
            sleep(sleep_time)
        else:
            break

    # Append the person's age at capture time (derived from EXIF data) to each
    # filename; remove images whose age cannot be determined.
    ageLabeler = ExifImageAgeLabeler()
    files = os.listdir(args.output)
    for fn in files:
        age, _ = ageLabeler.label_age(fn,
                                      birthdate_str=args.birthdate,
                                      image_dir=args.output)
        src = os.path.join(args.output, fn)
        if age is not None:
            imagename_with_age = (os.path.splitext(fn)[0] + "|{}".format(age)
                                  + os.path.splitext(fn)[1])
            dst = os.path.join(args.output, imagename_with_age)
            os.rename(src, dst)
        else:
            os.remove(src)

    logger.info("Finished.")
    return len(os.listdir(args.output))

import argparse

import crawler
import downloader


def main(argv):
    parser = argparse.ArgumentParser(description="Image Downloader")
    parser.add_argument("keywords", type=str,
                        help='Keywords to search. ("in quotes")')
    parser.add_argument("--engine", "-e", type=str, default="Google",
                        help="Image search engine.",
                        choices=["Google", "Bing", "Baidu"])
    parser.add_argument("--driver", "-d", type=str, default="chrome_headless",
                        help="Browser driver used to crawl search pages.",
                        choices=["chrome_headless", "chrome", "phantomjs"])
    parser.add_argument("--max-number", "-n", type=int, default=100,
                        help="Max number of images to download for the keywords.")
    parser.add_argument("--num-threads", "-j", type=int, default=50,
                        help="Number of threads to concurrently download images.")
    parser.add_argument("--timeout", "-t", type=int, default=20,
                        help="Seconds to wait before timing out an image download.")
    parser.add_argument("--output", "-o", type=str, default="./download_images",
                        help="Output directory to save downloaded images.")
    parser.add_argument("--safe-mode", "-S", action="store_true", default=False,
                        help="Turn on safe search mode. (Only effective in Google)")
    parser.add_argument("--face-only", "-F", action="store_true", default=False,
                        help="Only search for images containing faces.")
    parser.add_argument("--proxy_http", "-ph", type=str, default=None,
                        help="Set http proxy (e.g. 192.168.0.2:8080)")
    parser.add_argument("--proxy_socks5", "-ps", type=str, default=None,
                        help="Set socks5 proxy (e.g. 192.168.0.2:1080)")
    # The --type filter is not supported by Baidu.
    parser.add_argument("--type", "-ty", type=str, default=None,
                        help="What kinds of images to download.",
                        choices=["clipart", "linedrawing", "photograph"])
    # Supported colors differ per engine:
    #   Bing:   color (colored images), bw (black & white), Red, Orange,
    #           Yellow, Green, Teal, Blue, Purple, Pink, Brown, Black,
    #           Gray, White
    #   Baidu:  white, bw, black, pink, blue, red, yellow, purple, green,
    #           teal, orange, brown
    #   Google: bw, red, orange, yellow, green, teal, blue, purple, pink,
    #           white, gray, black, brown
    parser.add_argument("--color", "-cl", type=str, default=None,
                        help="Specify the color of desired images.")
    args = parser.parse_args(args=argv)

    proxy_type = None
    proxy = None
    if args.proxy_http is not None:
        proxy_type = "http"
        proxy = args.proxy_http
    elif args.proxy_socks5 is not None:
        proxy_type = "socks5"
        proxy = args.proxy_socks5

    crawled_urls = crawler.crawl_image_urls(args.keywords,
                                            engine=args.engine,
                                            max_number=args.max_number,
                                            face_only=args.face_only,
                                            safe_mode=args.safe_mode,
                                            proxy_type=proxy_type,
                                            proxy=proxy,
                                            browser=args.driver,
                                            image_type=args.type,
                                            color=args.color)
    downloader.download_images(image_urls=crawled_urls,
                               dst_dir=args.output,
                               concurrency=args.num_threads,
                               timeout=args.timeout,
                               proxy_type=proxy_type,
                               proxy=proxy,
                               file_prefix=args.engine)

    print("Finished.")

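# Usage sketch (illustrative; the script name is an assumption): wire this
# variant up as an entry point and pass the extra --type/--color filters
# from the shell:
#
#     if __name__ == '__main__':
#         import sys
#         main(sys.argv[1:])
#
#     $ python image_downloader.py "golden retriever" -e Bing -n 50 \
#           --type photograph --color Blue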