Example no. 1
def main(argv):
    parser = argparse.ArgumentParser(description="Image Downloader")
    parser.add_argument("keywords", type=str,
                        help='Keywords to search. ("in quotes")')
    parser.add_argument("--engine", "-e", type=str, default="Google",
                        help="Image search engine.", choices=["Google", "Bing", "Baidu"])
    parser.add_argument("--driver", "-d", type=str, default="chrome_headless",
                        help="Image search engine.", choices=["chrome_headless", "chrome", "phantomjs"])
    parser.add_argument("--max-number", "-n", type=int, default=100,
                        help="Max number of images download for the keywords.")
    parser.add_argument("--num-threads", "-j", type=int, default=50,
                        help="Number of threads to concurrently download images.")
    parser.add_argument("--timeout", "-t", type=int, default=20,
                        help="Seconds to timeout when download an image.")
    parser.add_argument("--output", "-o", type=str, default="./download_images",
                        help="Output directory to save downloaded images.")
    parser.add_argument("--safe-mode", "-S", action="store_true", default=False,
                        help="Turn on safe search mode. (Only effective in Google)")
    parser.add_argument("--face-only", "-F", action="store_true", default=False,
                        help="Only search for ")
    parser.add_argument("--proxy_http", "-ph", type=str, default=None,
                        help="Set http proxy (e.g. 192.168.0.2:8080)")
    parser.add_argument("--proxy_socks5", "-ps", type=str, default=None,
                        help="Set socks5 proxy (e.g. 192.168.0.2:1080)")

    args = parser.parse_args(args=argv)

    proxy_type = None
    proxy = None
    if args.proxy_http is not None:
        proxy_type = "http"
        proxy = args.proxy_http
    elif args.proxy_socks5 is not None:
        proxy_type = "socks5"
        proxy = args.proxy_socks5

    crawled_urls = crawler.crawl_image_urls(args.keywords,
                                            engine=args.engine, max_number=args.max_number,
                                            face_only=args.face_only, safe_mode=args.safe_mode,
                                            proxy_type=proxy_type, proxy=proxy,
                                            browser=args.driver)
    downloader.download_images(image_urls=crawled_urls, dst_dir=args.output,
                               concurrency=args.num_threads, timeout=args.timeout,
                               proxy_type=proxy_type, proxy=proxy,
                               file_prefix=args.engine)

    print("Finished.")
Example no. 2
def main(argv):
    parser = argparse.ArgumentParser(description="Image Downloader")
    #parser.add_argument("keywords", type=str,
    #                    help='Keywords to search. ("in quotes")')
    parser.add_argument("--engine", "-e", type=str, default="Google",
                        help="Image search engine.", choices=["Google", "Bing", "Baidu"])
    parser.add_argument("--max-number", "-n", type=int, default=50000,
                        help="Max number of images download for the keywords.")
    parser.add_argument("--num-threads", "-j", type=int, default=16,
                        help="Number of threads to concurrently download images.")
    parser.add_argument("--timeout", "-t", type=int, default=20,
                        help="Seconds to timeout when download an image.")
    parser.add_argument("--output", "-o", type=str, default="./download_images",
                        help="Output directory to save downloaded images.")
    parser.add_argument("--safe-mode", "-S", action="store_true", default=True,
                        help="Turn on safe search mode. (Only effective in Google)")
    parser.add_argument("--face-only", "-F", action="store_true", default=False,
                        help="Only search for ")
    parser.add_argument("--proxy_http", "-ph", type=str, default='web-proxy.tencent.com:8080',
                        help="Set http proxy (e.g. 192.168.0.2:8080)")
    parser.add_argument("--proxy_socks5", "-ps", type=str, default=None,
                        help="Set socks5 proxy (e.g. 192.168.0.2:1080)")

    args = parser.parse_args(args=argv)

    proxy_type = None
    proxy = None
    if args.proxy_http is not None:
        proxy_type = "http"
        proxy = args.proxy_http
    elif args.proxy_socks5 is not None:
        proxy_type = "socks5"
        proxy = args.proxy_socks5
    for _keywords in ['阅兵', '国庆大阅兵', '天安门阅兵', '队伍', '美国阅兵', '俄罗斯阅兵', '军队', '英国阅兵', '德国阅兵', '法国阅兵', '日本阅兵', '中国阅兵', '伊拉克阅兵']:

        for _engine in ["Google", "Bing", "Baidu"]:
            crawled_urls = crawler.crawl_image_urls(_keywords,
                                                    engine=_engine, max_number=args.max_number,
                                                    face_only=args.face_only, safe_mode=args.safe_mode,
                                                    proxy_type=proxy_type, proxy=proxy)
            downloader.download_images(image_urls=crawled_urls, dst_dir=args.output,
                                       concurrency=args.num_threads, timeout=args.timeout,
                                       proxy_type=proxy_type, proxy=proxy,
                                       file_prefix=_engine + '_' + hashlib.md5(_keywords.encode()).hexdigest())

    print("Finished.")
Example no. 3
def main(argv):
    parser = argparse.ArgumentParser(description="Image Downloader")
    parser.add_argument("keywords", type=str,
                        help='Keywords to search. ("in quotes")')
    parser.add_argument("--engine", "-e", type=str, default="Google",
                        help="Image search engine.", choices=["Google", "Bing", "Baidu"])
    parser.add_argument("--max-number", "-n", type=int, default=100,
                        help="Max number of images download for the keywords.")
    parser.add_argument("--num-threads", "-j", type=int, default=50,
                        help="Number of threads to concurrently download images.")
    parser.add_argument("--timeout", "-t", type=int, default=20,
                        help="Seconds to timeout when download an image.")
    parser.add_argument("--output", "-o", type=str, default="./download_images",
                        help="Output directory to save downloaded images.")
    parser.add_argument("--safe-mode", "-S", action="store_true", default=False,
                        help="Turn on safe search mode. (Only effective in Google)")
    parser.add_argument("--face-only", "-F", action="store_true", default=False,
                        help="Only search for ")
    parser.add_argument("--proxy_http", "-ph", type=str, default=None,
                        help="Set http proxy (e.g. 192.168.0.2:8080)")
    parser.add_argument("--proxy_socks5", "-ps", type=str, default=None,
                        help="Set socks5 proxy (e.g. 192.168.0.2:1080)")

    args = parser.parse_args(args=argv)

    proxy_type = None
    proxy = None
    if args.proxy_http is not None:
        proxy_type = "http"
        proxy = args.proxy_http
    elif args.proxy_socks5 is not None:
        proxy_type = "socks5"
        proxy = args.proxy_socks5

    # Unlike Example no. 1, this variant pins the crawler to PhantomJS instead
    # of exposing a --driver flag.
    crawled_urls = crawler.crawl_image_urls(args.keywords,
                                            engine=args.engine, max_number=args.max_number,
                                            face_only=args.face_only, safe_mode=args.safe_mode,
                                            proxy_type=proxy_type, proxy=proxy,
                                            browser="phantomjs")
    downloader.download_images(image_urls=crawled_urls, dst_dir=args.output,
                               concurrency=args.num_threads, timeout=args.timeout,
                               proxy_type=proxy_type, proxy=proxy,
                               file_prefix=args.engine)

    print("Finished.")
Example no. 4
def run_keyword(keyword, args, proxy, proxy_type):
    # Crawl image URLs for a single keyword, then download them into a
    # per-keyword sub-directory under args.output.
    crawled_urls = crawler.crawl_image_urls(keyword,
                                            engine=args.engine,
                                            max_number=args.max_number,
                                            face_only=args.face_only,
                                            safe_mode=args.safe_mode,
                                            proxy_type=proxy_type,
                                            proxy=proxy,
                                            browser=args.driver)

    dst_dir = os.path.join(args.output, keyword)
    downloader.download_images(image_urls=crawled_urls,
                               dst_dir=dst_dir,
                               concurrency=args.num_threads,
                               timeout=args.timeout,
                               proxy_type=proxy_type,
                               proxy=proxy,
                               file_prefix=args.engine)
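
run_keyword is written to be called once per keyword; a minimal driver sketch, assuming an already-parsed args object carrying the fields used above (the helper name is hypothetical):

def run_all(keywords, args, proxy=None, proxy_type=None):
    for keyword in keywords:
        # Each keyword ends up in its own sub-directory under args.output.
        run_keyword(keyword, args, proxy, proxy_type)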
Example no. 5
def fetch_image_urls(query: str,
                     limit: int = 20,
                     file_type: str = '',
                     filters: str = '',
                     extra_query_params: str = '') -> List[str]:
    result = []
    keywords = query
    if file_type:
        keywords = query + " " + file_type
    urls = crawl_image_urls(keywords,
                            filters,
                            limit,
                            extra_query_params=extra_query_params)
    for url in urls:
        if isValidURL(url, file_type) and url not in result:
            result.append(url)
            if len(result) >= limit:
                break
    return result
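
fetch_image_urls relies on an isValidURL helper that is not shown here; a plausible sketch of such a validator, whose rules are assumptions (the real project's checks may differ):

from urllib.parse import urlparse

def isValidURL(url: str, file_type: str = '') -> bool:
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https") or not parsed.netloc:
        return False
    # If a file type was requested, require a matching extension in the path.
    return file_type == '' or parsed.path.lower().endswith('.' + file_type.lower())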
Example no. 6
def google_download(argv):

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    parser = argparse.ArgumentParser(description="Image Downloader")
    parser.add_argument("keywords",
                        type=str,
                        help='Keywords to search. ("in quotes")')
    parser.add_argument("--engine",
                        "-e",
                        type=str,
                        default="Google",
                        help="Image search engine.",
                        choices=["Google", "Bing", "Baidu"])
    parser.add_argument("--driver",
                        "-d",
                        type=str,
                        default="chrome_headless",
                        help="Image search engine.",
                        choices=["chrome_headless", "chrome", "phantomjs"])
    parser.add_argument("--max-number",
                        "-n",
                        type=int,
                        default=100,
                        help="Max number of images download for the keywords.")
    parser.add_argument(
        "--num-threads",
        "-j",
        type=int,
        default=50,
        help="Number of threads to concurrently download images.")
    parser.add_argument("--timeout",
                        "-t",
                        type=int,
                        default=20,
                        help="Seconds to timeout when download an image.")
    parser.add_argument("--output",
                        "-o",
                        type=str,
                        default="./download_images",
                        help="Output directory to save downloaded images.")
    parser.add_argument(
        "--safe-mode",
        "-S",
        action="store_true",
        default=False,
        help="Turn on safe search mode. (Only effective in Google)")
    parser.add_argument("--label-age",
                        "-l",
                        action="store_true",
                        default=True,
                        help="extract the age ")
    parser.add_argument("--birthdate",
                        "-B",
                        type=str,
                        default=None,
                        help="birthdate of the searched person")
    parser.add_argument("--face-only",
                        "-F",
                        action="store_true",
                        default=False,
                        help="Only search for ")
    parser.add_argument("--proxy_http",
                        "-ph",
                        type=str,
                        default=None,
                        help="Set http proxy (e.g. 192.168.0.2:8080)")
    parser.add_argument("--proxy_socks5",
                        "-ps",
                        type=str,
                        default=None,
                        help="Set socks5 proxy (e.g. 192.168.0.2:1080)")

    args = parser.parse_args(args=argv)
    # argv = ['-e', 'Google', '-d', 'chrome_headless', '-n', '40', '-j', '10', '-o', 'img/google/kids10/Colin_Baiocchi', '-F', '-S', 'Colin Baiocchi']

    proxy_type = None
    proxy = None
    if args.proxy_http is not None:
        proxy_type = "http"
        proxy = args.proxy_http
    elif args.proxy_socks5 is not None:
        proxy_type = "socks5"
        proxy = args.proxy_socks5

    if args.label_age and args.birthdate is None:
        raise RuntimeError("Birthdate is necessary if args.label_age is True")

    sleep_time = 2
    num_retries = 4

    for x in range(num_retries):
        """
        selenium.common.exceptions.WebDriverException: Message: unknown error: Chrome failed to start: exited abnormally.
        (unknown error: DevToolsActivePort file doesn't exist)
        (The process started from chrome location /usr/bin/chromium is no longer running, so ChromeDriver is assuming that Chrome has crashed.)

        """
        try:
            crawled_urls = crawler.crawl_image_urls(args.keywords,
                                                    engine=args.engine,
                                                    max_number=args.max_number,
                                                    face_only=args.face_only,
                                                    safe_mode=args.safe_mode,
                                                    proxy_type=proxy_type,
                                                    proxy=proxy,
                                                    browser=args.driver)
            downloader.download_images(image_urls=crawled_urls,
                                       dst_dir=args.output,
                                       concurrency=args.num_threads,
                                       timeout=args.timeout,
                                       proxy_type=proxy_type,
                                       proxy=proxy,
                                       file_prefix=args.keywords)
        except WebDriverException:
            sleep(sleep_time)
        else:
            break

    ageLabeler = ExifImageAgeLabeler()
    # dir = "Image-Downloader/download_images/google/kids10"
    files = os.listdir(args.output)
    # files = [file for file in files if os.path.isfile(file)]
    for fn in files:
        age, _ = ageLabeler.label_age(fn,
                                      birthdate_str=args.birthdate,
                                      image_dir=args.output)
        if age is not None:
            src = os.path.join(args.output, fn)
            # Append the inferred age to the file name ("name|12.jpg"); note
            # that "|" is not a legal file-name character on Windows.
            imagename_with_age = os.path.splitext(fn)[0] + "|{}".format(
                age) + os.path.splitext(fn)[1]
            dst = os.path.join(args.output, imagename_with_age)
            os.rename(src, dst)
        else:
            src = os.path.join(args.output, fn)
            os.remove(src)

    logger.info("Finished.")

    return len(os.listdir(args.output))
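
The retry loop above sleeps a fixed 2 seconds between attempts; a common variant is exponential backoff. A minimal sketch of that pattern (the helper name is hypothetical; the exception type is selenium's):

from time import sleep

from selenium.common.exceptions import WebDriverException

def with_retries(fn, num_retries=4, base_delay=2):
    for attempt in range(num_retries):
        try:
            return fn()
        except WebDriverException:
            if attempt == num_retries - 1:
                raise
            sleep(base_delay * 2 ** attempt)  # 2s, 4s, 8s, ...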
Example no. 7
def main(argv):
    parser = argparse.ArgumentParser(description="Image Downloader")
    parser.add_argument("keywords",
                        type=str,
                        help='Keywords to search. ("in quotes")')
    parser.add_argument("--engine",
                        "-e",
                        type=str,
                        default="Google",
                        help="Image search engine.",
                        choices=["Google", "Bing", "Baidu"])
    parser.add_argument("--driver",
                        "-d",
                        type=str,
                        default="chrome_headless",
                        help="Image search engine.",
                        choices=["chrome_headless", "chrome", "phantomjs"])
    parser.add_argument("--max-number",
                        "-n",
                        type=int,
                        default=100,
                        help="Max number of images download for the keywords.")
    parser.add_argument(
        "--num-threads",
        "-j",
        type=int,
        default=50,
        help="Number of threads to concurrently download images.")
    parser.add_argument("--timeout",
                        "-t",
                        type=int,
                        default=20,
                        help="Seconds to timeout when download an image.")
    parser.add_argument("--output",
                        "-o",
                        type=str,
                        default="./download_images",
                        help="Output directory to save downloaded images.")
    parser.add_argument(
        "--safe-mode",
        "-S",
        action="store_true",
        default=False,
        help="Turn on safe search mode. (Only effective in Google)")
    parser.add_argument("--face-only",
                        "-F",
                        action="store_true",
                        default=False,
                        help="Only search for ")
    parser.add_argument("--proxy_http",
                        "-ph",
                        type=str,
                        default=None,
                        help="Set http proxy (e.g. 192.168.0.2:8080)")
    parser.add_argument("--proxy_socks5",
                        "-ps",
                        type=str,
                        default=None,
                        help="Set socks5 proxy (e.g. 192.168.0.2:1080)")
    # The --type filter is not supported by Baidu.
    parser.add_argument("--type",
                        "-ty",
                        type=str,
                        default=None,
                        help="What kinds of images to download.",
                        choices=["clipart", "linedrawing", "photograph"])
    # Supported colors per engine:
    #   Bing:   "color" for colored images, "bw" for black & white, or a specific
    #           color: red, orange, yellow, green, teal, blue, purple, pink,
    #           brown, black, gray, white
    #   Baidu:  white, bw, black, pink, blue, red, yellow, purple, green, teal,
    #           orange, brown
    #   Google: bw, red, orange, yellow, green, teal, blue, purple, pink, white,
    #           gray, black, brown
    parser.add_argument("--color",
                        "-cl",
                        type=str,
                        default=None,
                        help="Specify the color of desired images.")

    args = parser.parse_args(args=argv)

    proxy_type = None
    proxy = None
    if args.proxy_http is not None:
        proxy_type = "http"
        proxy = args.proxy_http
    elif args.proxy_socks5 is not None:
        proxy_type = "socks5"
        proxy = args.proxy_socks5

    crawled_urls = crawler.crawl_image_urls(args.keywords,
                                            engine=args.engine,
                                            max_number=args.max_number,
                                            face_only=args.face_only,
                                            safe_mode=args.safe_mode,
                                            proxy_type=proxy_type,
                                            proxy=proxy,
                                            browser=args.driver,
                                            image_type=args.type,
                                            color=args.color)
    downloader.download_images(image_urls=crawled_urls,
                               dst_dir=args.output,
                               concurrency=args.num_threads,
                               timeout=args.timeout,
                               proxy_type=proxy_type,
                               proxy=proxy,
                               file_prefix=args.engine)

    print("Finished.")