Example no. 1
def test_bing():
    print('start testing BingImageCrawler')
    bing_crawler = BingImageCrawler(
        downloader_threads=2,
        storage={'root_dir': 'images/bing'},
        log_level=logging.INFO)
    search_filters = dict(
        type='photo',
        license='commercial',
        layout='wide',
        size='large',
        date='pastmonth')
    bing_crawler.crawl('cat', max_num=10, filters=search_filters)
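The fragment above relies on imports from its surrounding test module; a self-contained version (a minimal sketch, with the output directory chosen here only for illustration) looks like this:

import logging

from icrawler.builtin import BingImageCrawler

# two download threads, files saved under images/bing
bing_crawler = BingImageCrawler(
    downloader_threads=2,
    storage={'root_dir': 'images/bing'},
    log_level=logging.INFO)

# Bing search filters supported by icrawler: type, license, layout, size, date
search_filters = dict(
    type='photo',
    license='commercial',
    layout='wide',
    size='large',
    date='pastmonth')

bing_crawler.crawl('cat', max_num=10, filters=search_filters)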
Example no. 2
def test_bing():
    img_dir = osp.join(test_dir, 'bing')
    bing_crawler = BingImageCrawler(
        downloader_threads=2,
        storage={'root_dir': img_dir},
        log_level=logging.INFO)
    search_filters = dict(
        type='photo',
        license='commercial',
        layout='wide',
        size='large',
        date='pastmonth')
    bing_crawler.crawl('cat', max_num=5, filters=search_filters)
    shutil.rmtree(img_dir)
Example no. 3
    def crawl_keywords_save_folder(cls,
                                   name: str,
                                   keywords: List[str],
                                   base_dir: str = "tmp",
                                   filters: Dict = {"size": "large"},
                                   max_num: int = 10000,
                                   train_ratio: float = 0.8):
        """キーワードでクロールしてデータセットを作成する。
            Arguments:
            name {str} -- データセット名
            keywords {List[str]} -- キーワード
            filter {[type]} -- [description]

        Keyword Arguments:
            dest_base_dir {str} -- [description] (default: {"tmp/crawl"})
            train_ratio {float} -- [description] (default: {0.8})
        """
        download_base = base_dir + os.path.sep + name
        for k in keywords:
            download_dir = download_base + os.path.sep + "train" + os.path.sep + k
            if not os.path.exists(download_dir):
                os.makedirs(download_dir)
            storage = {"root_dir": download_dir}

            print("keyword:", k, " dir", download_dir)

            crawler = BingImageCrawler(storage=storage)
            crawler.crawl(keyword=k,
                          filters=filters,
                          max_num=max_num,
                          file_idx_offset=0)

            move_dir = download_base + os.path.sep + "test" + os.path.sep + k
            if not os.path.exists(move_dir):
                os.makedirs(move_dir)

            file_list = glob.glob(download_dir + os.path.sep + "*.jpg")
            move_num: int = int(len(file_list) * train_ratio)

            move_list = file_list[move_num:]
            for f in move_list:
                shutil.move(f, move_dir)
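Because the method above is a classmethod of a class that the excerpt does not show, a call would look roughly like the following sketch (the class name DatasetBuilder is purely hypothetical):

# Hypothetical: DatasetBuilder stands in for the enclosing class, which is not shown in the excerpt.
DatasetBuilder.crawl_keywords_save_folder(
    name="dogs_vs_cats",
    keywords=["dog", "cat"],
    base_dir="tmp",
    filters={"size": "large"},
    max_num=500,
    train_ratio=0.8)
# Result: tmp/dogs_vs_cats/train/<keyword> and tmp/dogs_vs_cats/test/<keyword>,
# with roughly 80% of the downloaded .jpg files left in train and the rest moved to test.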
Example no. 4
def crawl(folder: str, search: str, maxnum: int, crawlers: List[str] = ['GOOGLE', 'BING', 'BAIDU']) -> Dict[str, str]:
    """Crawl web sites for images"""
    print('(1) Crawling ...')
    # prepare folders
    os.makedirs(folder, exist_ok=True)

    sources = {}
    if maxnum > 1000:
        print("Max num limited to 1000")
        maxnum = 1000

    for c in crawlers:
        print(f'    -> {c}')
        if c == 'GOOGLE':
            google_crawler = GoogleImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                feeder_threads=1,
                parser_threads=1,
                downloader_threads=4,
                storage={'root_dir': folder})

            google_crawler.crawl(keyword=search, offset=0, max_num=maxnum,
                                min_size=(200,200), max_size=None, file_idx_offset=0)

        if c == 'BING':
            bing_crawler = BingImageCrawler(downloader_cls=CustomDownloader,
                                            log_level=logging.CRITICAL,
                                            downloader_threads=4,
                                            storage={'root_dir': folder})
            bing_crawler.crawl(keyword=search, filters=None, offset=0, max_num=maxnum, file_idx_offset='auto')


        if c == 'BAIDU':
            baidu_crawler = BaiduImageCrawler(downloader_cls=CustomDownloader,
                                    log_level=logging.CRITICAL,
                                    storage={'root_dir': folder})
            baidu_crawler.crawl(keyword=search, offset=0, max_num=maxnum,
                                min_size=(200,200), max_size=None, file_idx_offset='auto')


    return {k: v for k, v in CustomDownloader.registry.items() if k is not None}
Example no. 5
def Query(query, verb, google=True, google_year=1, bing=True, baidu=True):
    SAVE_DIR = os.path.join(ROOT_DIR, verb)
    if not os.path.exists(SAVE_DIR):
        os.makedirs(SAVE_DIR)

    # SAVE_DIR = os.path.join(ROOT_DIR, query)
    if google:
        google_path = os.path.join(SAVE_DIR, 'Google')
        if not os.path.exists(google_path):
            os.makedirs(google_path)
        google_crawler = GoogleImageCrawler(feeder_threads=1,
                                            parser_threads=1,
                                            downloader_threads=4,
                                            storage={'root_dir': google_path})
        now_year = 2018
        for past_year in range(google_year):
            from_year = now_year - past_year
            filters = dict(license='noncommercial,modify',
                           date=((from_year, 1, 1), (from_year, 12, 30)))
            google_crawler.crawl(keyword=query,
                                 filters=filters,
                                 max_num=1000,
                                 file_idx_offset='auto')

    if bing:
        bing_crawler = BingImageCrawler(
            downloader_threads=4,
            storage={'root_dir': os.path.join(SAVE_DIR, 'Bing')})
        filters_bing = dict(
            # size='large',
            # color='orange',
            license='noncommercial,modify')
        bing_crawler.crawl(keyword=query,
                           filters=filters_bing,
                           offset=0,
                           max_num=1000)

    if baidu:
        baidu_crawler = BaiduImageCrawler(
            storage={'root_dir': os.path.join(SAVE_DIR, 'Baidu')})
        baidu_crawler.crawl(keyword=query, offset=0, max_num=1000)
Example no. 6
def getImg(keywords='', dirpath='', amount=0, source=4):
    if source == 1:
        print('\n--- Starting download from Google Images ---\n')
        google_crawler = GoogleImageCrawler(parser_threads=2, downloader_threads=4,storage={'root_dir': dirpath})
        google_crawler.crawl(keyword=keywords, offset=0, max_num=amount, date_min=None, date_max=None, min_size=(200,200), max_size=None)

    elif source == 2:
        print('\n--- Starting download from Microsoft Bing ---\n')
        bing_crawler = BingImageCrawler(downloader_threads=4, storage={'root_dir': dirpath})
        bing_crawler.crawl(keyword=keywords, offset=0, max_num=amount, min_size=None, max_size=None)

    elif source == 3:
        print('\n--- Starting download from Baidu ---\n')
        baidu_crawler = BaiduImageCrawler(storage={'root_dir': dirpath})
        baidu_crawler.crawl(keyword=keywords, offset=0, max_num=amount, min_size=None, max_size=None)

    else:
        print('\n--- Starting download from Google Images ---\n')
        google_crawler = GoogleImageCrawler(parser_threads=2, downloader_threads=4,storage={'root_dir': dirpath})
        google_crawler.crawl(keyword=keywords, offset=0, max_num=amount, date_min=None, date_max=None, min_size=(200,200), max_size=None)
        print('\n--- Starting download from Microsoft Bing ---\n')
        bing_crawler = BingImageCrawler(downloader_threads=4, storage={'root_dir': dirpath})
        bing_crawler.crawl(keyword=keywords, offset=0, max_num=amount, min_size=None, max_size=None)
        print('\n--- Starting download from Baidu ---\n')
        baidu_crawler = BaiduImageCrawler(storage={'root_dir': dirpath})
        baidu_crawler.crawl(keyword=keywords, offset=0, max_num=amount, min_size=None, max_size=None)
Example no. 7
def crawel_auto(search_word, get_num):
    dir_name = "crawel"
    print("Googleのクローリングを開始しました。")
    # Google
    googleCrawler = GoogleImageCrawler(
        storage={"root_dir": f'{dir_name}/{search_word}'},
        log_level=logging.CRITICAL)
    googleCrawler.crawl(keyword=search_word, max_num=get_num)

    #print("Baiduのクローリングを開始しました。")
    #Baidu
    #baiduCrawler = BaiduImageCrawler(storage={"root_dir": f'{dir_name}/{search_word}'}, log_level=logging.CRITICAL)
    #baiduCrawler.crawl(keyword=search_word, max_num=get_num, file_idx_offset=get_num)

    print("Bingのクローリングを開始しました。")
    #Bing
    bingCrawler = BingImageCrawler(
        storage={"root_dir": f'{dir_name}/{search_word}'},
        log_level=logging.CRITICAL)
    bingCrawler.crawl(keyword=search_word,
                      max_num=get_num,
                      file_idx_offset=get_num * 2)
Example no. 8
def main(args):

    if args.output_dir is None:
        raise ValueError('output dir must be assigned')

    os.makedirs(args.output_dir, exist_ok=True)

    root_dir = os.path.join(args.output_dir, args.search_keyword)

    os.makedirs(root_dir, exist_ok=True)

    crawler = None
    if args.search_engine == 'bing':
        crawler = BingImageCrawler(feeder_threads=2,
                                   parser_threads=2,
                                   downloader_threads=10,
                                   storage={'root_dir': root_dir})

    crawler.crawl(keyword=args.search_keyword,
                  filters=None,
                  offset=0,
                  max_num=args.number_of_image)
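The main function above expects an argparse-style namespace with output_dir, search_keyword, search_engine and number_of_image attributes; a hypothetical command-line wiring (not part of the original excerpt) could be:

import argparse

if __name__ == '__main__':
    # argument names mirror the attributes main() reads; the defaults are assumptions
    parser = argparse.ArgumentParser(description='Download images with icrawler')
    parser.add_argument('--output_dir', type=str, default=None)
    parser.add_argument('--search_keyword', type=str, required=True)
    parser.add_argument('--search_engine', type=str, default='bing')
    parser.add_argument('--number_of_image', type=int, default=100)
    main(parser.parse_args())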
Example no. 9
def init_crawler(path, crawler=None, nthreads=4):
    assert crawler is not None, 'crawler is set as None.'
    if crawler in ['google']:
        m_crawler = GoogleImageCrawler(downloader_threads=nthreads,
                                       storage={'root_dir': path},
                                       log_level=logging.INFO)
    elif crawler in ['bing']:
        m_crawler = BingImageCrawler(storage={'root_dir': path},
                                     log_level=logging.INFO)
    elif crawler in ['baidu']:
        m_crawler = BaiduImageCrawler(downloader_threads=nthreads,
                                      storage={'root_dir': path})
    return m_crawler
Example no. 10
    def _image_scraping(self, keyword: str, max_num: int, storage: str) -> None:
        """
        Download images from Bing and save them locally.

        Args:
            keyword (str): search term for the images to download
            max_num (int): number of images to download
            storage (str): local directory the images are saved to
        """
        crawler: BingImageCrawler = BingImageCrawler(storage={"root_dir": storage})
        crawler.crawl(keyword=keyword, max_num=max_num)
        # sleep briefly (1 second) to reduce the load on the server
        time.sleep(1)
Example no. 11
def download_bing_images(
    entity: str, entity_type: str, download_folder: str, num_images: int, img_license: str, use_entity_type_query=False
):

    # create output folder
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    # init crawler
    crawler = BingImageCrawler(
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=4,
        downloader_cls=CostumDownloader,
        storage={"backend": "FileSystem", "root_dir": download_folder},
        extra_downloader_args={
            "entity_type": entity_type,
            "entity": entity,
            "root_dir": download_folder,
            "engine": "bing",
            "license": img_license,
        },
    )

    # specify search query
    if img_license == "noncommercial":
        filters = dict(type="photo", license="noncommercial")
    else:  # license == 'all':
        filters = dict(type="photo")

    if use_entity_type_query:
        keyword = entity + " " + entity_type
    else:
        keyword = entity

    # crawl images
    crawler.crawl(keyword=keyword, max_num=num_images, filters=filters)
    return crawler.downloader.entity_dict
Example no. 12
def exe_crawl(arg):
    # google_crawler = GoogleImageCrawler(
    #     downloader_cls=PrefixNameGoogleDownloader,
    #     feeder_threads=1,
    #     parser_threads=1,
    #     downloader_threads=4,
    #     storage={'root_dir': f'{arg.dict}/{arg.keyword}/google'})
    filters = dict(license=f'{arg.license}')
    # google_crawler.crawl(keyword=f'{arg.keyword}', filters=filters, offset=0, max_num=arg.max, file_idx_offset=0)

    bing_crawler = BingImageCrawler(
        downloader_cls=PrefixNameBingDownloader,
        downloader_threads=4,
        storage={'root_dir': f'{arg.dict}/{arg.keyword}/bing'})
    bing_crawler.crawl(keyword=f'{arg.keyword}',
                       filters=filters,
                       offset=0,
                       max_num=arg.max)

    baidu_crawler = BaiduImageCrawler(
        downloader_cls=PrefixNameBaiduDownloader,
        storage={'root_dir': f'{arg.dict}/{arg.keyword}/baidu'})
    baidu_crawler.crawl(keyword=f'{arg.keyword}', offset=0, max_num=arg.max)
Example no. 13
CloudTypesList = open('SingleCloud.txt', 'r')

for cloudTypesName in CloudTypesList:
    cloud_type = cloudTypesName.strip('\n')
    # cloud_type = "single cloud in the sky"
    # imageDir = image_path + '\\' + cloud_type
    print("image path--------------" + image_path)

    # # flickr crawling
    # flickr_crawler = FlickrImageCrawler(Flickr_API_Key, parser_threads=2, downloader_threads=4, storage={'root_dir': image_path})
    # flickr_crawler.crawl(text=cloud_type, max_num=1000, tags=cloud_type)

    # google crawling
    google_crawler = GoogleImageCrawler(parser_threads=2,
                                        downloader_threads=4,
                                        storage={'root_dir': image_path})
    google_crawler.crawl(keyword=cloud_type,
                         max_num=1000,
                         file_idx_offset='auto')

    # bing crawling
    bing_crawler = BingImageCrawler(parser_threads=2,
                                    downloader_threads=4,
                                    storage={'root_dir': image_path})
    bing_crawler.crawl(keyword=cloud_type,
                       max_num=1000,
                       file_idx_offset='auto')

print("Image Collection is done")
Example no. 14
def BingCrawl_multi_thread(SEARCH_WORD: str, WORKING_DIRECTORY: str,
                           SEARCH_QT: int):
    '''Fetch images via Bing image search (multi-threaded).
    '''
    # multiple threads
    crawler = BingImageCrawler(feeder_threads=4,
                               parser_threads=4,
                               downloader_threads=4,
                               storage={"root_dir": WORKING_DIRECTORY})
    # crawl
    crawler.crawl(keyword=SEARCH_WORD,
                  max_num=SEARCH_QT,
                  filters={'license': 'creativecommons'},
                  file_idx_offset=0)
    crawler.crawl(keyword=SEARCH_WORD,
                  max_num=SEARCH_QT,
                  filters={'license': 'publicdomain'},
                  file_idx_offset='auto')
    crawler.crawl(keyword=SEARCH_WORD,
                  max_num=SEARCH_QT,
                  filters={'license': 'noncommercial'},
                  file_idx_offset='auto')
    crawler.crawl(keyword=SEARCH_WORD,
                  max_num=SEARCH_QT,
                  filters={'license': 'commercial'},
                  file_idx_offset='auto')
    crawler.crawl(keyword=SEARCH_WORD,
                  max_num=SEARCH_QT,
                  filters={'license': 'noncommercial,modify'},
                  file_idx_offset='auto')
    crawler.crawl(keyword=SEARCH_WORD,
                  max_num=SEARCH_QT,
                  filters={'license': 'commercial,modify'},
                  file_idx_offset='auto')
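The six crawl calls above differ only in the license filter; an equivalent, behavior-preserving sketch iterates over the license values, using file_idx_offset=0 for the first batch and 'auto' afterwards so later batches do not overwrite earlier files:

    licenses = ['creativecommons', 'publicdomain', 'noncommercial', 'commercial',
                'noncommercial,modify', 'commercial,modify']
    for i, lic in enumerate(licenses):
        crawler.crawl(keyword=SEARCH_WORD,
                      max_num=SEARCH_QT,
                      filters={'license': lic},
                      file_idx_offset=0 if i == 0 else 'auto')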
Example no. 15
from icrawler.builtin import BingImageCrawler

models = ["Tesla Model 3", "Tesla Model S", "Tesla Model X", "Tesla Model Y"]

for model in models:
    storage = {"backend": "FileSystem", "root_dir": "data/" + model}
    crawler = BingImageCrawler(storage=storage)
    crawler.crawl(keyword=model, max_num=1000)
Example no. 16
# -*- coding: utf-8 -*-
"""bing_image_crawler.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1LbS3tAhjPKdVCPSNgFLh7yyNmwNrRHBs
"""

!pip install icrawler

from icrawler.builtin import BingImageCrawler

bing_crawler = BingImageCrawler(
    downloader_threads=4,
    storage={'root_dir': 'C:/bus'}  # set the save path as the value
)
# filter

bing_crawler.crawl(keyword='bus', filters=None, offset=0, max_num=1000)  # keyword = search term, max_num = maximum number of files

Example no. 17
def switchgame_thumbnail_download_bing(switchgame_id: str, switchgame_title: str):

    # The macOS ('Darwin') and Linux branches were identical except for the
    # platform string passed to Information.download_dir(), so they are
    # handled together here.
    if pf in ('Darwin', 'Linux'):

        save_dir = Information.download_dir(pf) + str(switchgame_id)

        # Delete the previously downloaded folder on the server, if any
        if os.path.exists(save_dir + "/"):
            shutil.rmtree(save_dir + "/")

        # Create the destination directory
        os.makedirs(str(switchgame_id), exist_ok=True)
        crawler = BingImageCrawler(storage={"root_dir": save_dir})
        print("1. Image Folder Remove Complete")

        # Normalize the Switch game title for the image search
        switchgame_title = title_search_conv(switchgame_title)

        # Search for the title and download the images
        crawler.crawl(keyword=switchgame_title, max_num=7)
        print("2. File Server Upload Complete")

        # Resize the downloaded thumbnails: 000001.jpg to a height of 461 px,
        # 000002.jpg through 000007.jpg to a height of 187 px
        for idx in range(1, 8):
            height = 461 if idx == 1 else 187
            img_path = save_dir + "/{:06d}.jpg".format(idx)
            print("Resizing {:06d}.jpg".format(idx))
            img = Image.open(img_path)
            img_resize = img.convert('RGB').resize((calc_image(img, height), height))
            img_resize.save(img_path)

        print("3. Image Resize Complete")

    image_url = list()
    image_url.append("image_uploaded")

    return image_url
Example no. 18
targetDir = baseDir + '/' + targetName

if not os.path.isdir(baseDir):
    os.mkdir(baseDir)
if not os.path.isdir(targetDir):
    os.mkdir(targetDir)

for idx, img in enumerate(image):
    print(idx)
    imgSrc = img['data-source']
    with urlopen(imgSrc, context=context) as f:
        with open(targetDir + '/' + targetName + '_' + str(idx) + '.jpg',
                  'wb') as h:
            image = f.read()
            h.write(image)
    if idx > 50: break
print('Naver Crawling done.')

# Bing Crawling
bing_crawler = BingImageCrawler(feeder_threads=10,
                                parser_threads=10,
                                downloader_threads=10,
                                storage={'root_dir': targetDir})
bing_crawler.session.verify = False
filters = dict(type='photo')  # only photo
bing_crawler.crawl(keyword=targetName,
                   min_size=(200, 200),
                   filters=filters,
                   max_num=searchNum,
                   file_idx_offset='auto')
Example no. 19

if engine in ('Google', 'google'):
    google_crawler = GoogleImageCrawler(downloader_cls=MyImageDownloader,
                                        feeder_threads=1,
                                        parser_threads=1,
                                        downloader_threads=4,
                                        storage={'root_dir': 'matches'})
    #        log_level=logging.INFO,
    #        extra_downloader_args={'log_file': 'meta.txt'})
    google_crawler.crawl(keyword=(query), max_num=(num), file_idx_offset=0)

elif engine in ('Bing', 'bing'):
    bing_crawler = BingImageCrawler(downloader_cls=MyImageDownloader,
                                    feeder_threads=1,
                                    parser_threads=1,
                                    downloader_threads=4,
                                    storage={'root_dir': 'matches'})
    #        log_level=logging.INFO,
    #        extra_downloader_args={'log_file': 'meta.txt'})
    bing_crawler.crawl(keyword=(query), filters=None, offset=0, max_num=(num))

elif engine in ('Baidu', 'baidu'):
    baidu_crawler = BaiduImageCrawler(downloader_cls=MyImageDownloader,
                                      feeder_threads=1,
                                      parser_threads=1,
                                      downloader_threads=4,
                                      storage={'root_dir': 'matches'})
    #        log_level=logging.INFO,
    #        extra_downloader_args={'log_file': 'meta.txt'})
    baidu_crawler.crawl(keyword=(query),
Example no. 20
for keyword in keywords:

    save_path = 'D:/Korean Celeb Data/' + keyword

    google_crawler = GoogleImageCrawler(
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=4,
        storage={'root_dir': save_path + '/google'})

    filters = dict(type="face")

    google_crawler.crawl(keyword=keyword,
                         filters=filters,
                         offset=0,
                         max_num=1000,
                         min_size=(200, 200),
                         max_size=None,
                         file_idx_offset=0)

    bing_crawler = BingImageCrawler(downloader_threads=4,
                                    storage={'root_dir': save_path + '/bing'})
    bing_crawler.crawl(keyword=keyword, filters=None, offset=0, max_num=1000)

    baidu_crawler = BaiduImageCrawler(
        storage={'root_dir': save_path + '/baidu'})
    baidu_crawler.crawl(keyword=keyword,
                        offset=0,
                        max_num=1000,
                        min_size=(200, 200),
                        max_size=None)
Example no. 21
google_crawler = GoogleImageCrawler(parser_threads=2,
                                    downloader_threads=4,
                                    storage={'root_dir': key_word})
google_crawler.crawl(keyword=key_word,
                     offset=0,
                     max_num=number,
                     date_min=None,
                     date_max=None,
                     min_size=(200, 200),
                     max_size=None)

files = os.listdir(key_word)
for f in files:
    os.rename(os.path.join(key_word, f), os.path.join(key_word, 'google' + f))

bing_crawler = BingImageCrawler(downloader_threads=4,
                                storage={'root_dir': key_word})
bing_crawler.crawl(keyword=key_word,
                   offset=0,
                   max_num=number,
                   min_size=None,
                   max_size=None)

files = os.listdir(key_word)
for f in files:
    if f[0] != 'g':
        os.rename(os.path.join(key_word, f),
                  os.path.join(key_word, 'bing' + f))

files = os.listdir(key_word)

for f in files:
Example no. 22
from icrawler.builtin import BingImageCrawler
for keyword in ['greyhound']:
    bing_crawler = BingImageCrawler(
        parser_threads=2,
        downloader_threads=4,
        storage={'root_dir': 'images/{}'.format(keyword)})
    bing_crawler.crawl(keyword=keyword, max_num=1000, min_size=(200, 200))
Example no. 23
from icrawler.builtin import BingImageCrawler
crawler = BingImageCrawler(storage={"root_dir": "dogs"})
crawler.crawl(keyword="犬", max_num=10)
Example no. 24
from icrawler.builtin import BingImageCrawler
crawler = BingImageCrawler(storage={"root_dir": "mario_images"})
crawler.crawl(keyword="まりお流ラーメン", max_num=1000)
Example no. 25
class PrefixNameDownloaderBing(ImageDownloader):
    prefix = 'bing'

    def get_filename(self, task, default_ext):
        filename = super(PrefixNameDownloaderBing,
                         self).get_filename(task, default_ext)
        return self.prefix + '_' + filename


list_names = getListNames()
print(list_names)
for name in list_names:
    google_crawler = GoogleImageCrawler(
        downloader_cls=PrefixNameDownloaderGoogle,
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=4,
        storage={'root_dir': 'images/' + name})

    google_crawler.crawl(keyword=name,
                         offset=0,
                         max_num=1000,
                         min_size=(200, 200),
                         max_size=None,
                         file_idx_offset=0)

    bing_crawler = BingImageCrawler(downloader_cls=PrefixNameDownloaderBing,
                                    downloader_threads=4,
                                    storage={'root_dir': 'images/' + name})
    bing_crawler.crawl(keyword=name, filters=None, offset=0, max_num=1000)
Example no. 26
def crawl(
    folder: str,
    search: str,
    maxnum: int,
    crawlers: List[str] = ["GOOGLE", "BING", "BAIDU", "FLICKR"],
) -> Dict[str, str]:
    """Crawl web sites for images"""
    print("(1) Crawling ...")
    # prepare folders
    os.makedirs(folder, exist_ok=True)

    if maxnum > 1000:
        print("Max num limited to 1000")
        maxnum = 1000

    for c in crawlers:
        print(f"    -> {c}")
        if c == "GOOGLE":
            google_crawler = GoogleImageCrawler(
                downloader_cls=CustomDownloader,
                parser_cls=GoogleParser,
                log_level=logging.CRITICAL,
                feeder_threads=1,
                parser_threads=1,
                downloader_threads=4,
                storage={"root_dir": folder},
            )

            google_crawler.crawl(
                keyword=search,
                offset=0,
                max_num=maxnum,
                min_size=(200, 200),
                max_size=None,
                file_idx_offset=0,
            )

        if c == "BING":
            bing_crawler = BingImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                downloader_threads=4,
                storage={"root_dir": folder},
            )
            bing_crawler.crawl(
                keyword=search,
                filters=None,
                offset=0,
                max_num=maxnum,
                file_idx_offset="auto",
            )

        if c == "BAIDU":
            baidu_crawler = BaiduImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                storage={"root_dir": folder},
            )
            baidu_crawler.crawl(
                keyword=search,
                offset=0,
                max_num=maxnum,
                min_size=(200, 200),
                max_size=None,
                file_idx_offset="auto",
            )

        if c == "FLICKR":
            flick_api_key = os.environ.get("FLICKR_API_KEY")
            if not flick_api_key:
                print(
                    "Error: Flickr crawler requires FLICKR_API_KEY environment variable"
                    " to be set with your non-secret API key.")
                exit(-1)

            flickr_crawler = FlickrImageCrawler(
                flick_api_key,
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                storage={"root_dir": folder},
            )
            flickr_crawler.crawl(
                text=search,
                offset=0,
                max_num=maxnum,
                min_size=(200, 200),
                max_size=None,
                file_idx_offset="auto",
            )

    return {
        k: v
        for k, v in CustomDownloader.registry.items() if k is not None
    }
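A hypothetical invocation of the crawl() helper above (folder, search term and limits chosen only for illustration; the FLICKR source additionally needs the FLICKR_API_KEY environment variable mentioned in the code):

results = crawl(folder="images/greyhound",
                search="greyhound",
                maxnum=200,
                crawlers=["GOOGLE", "BING"])
# the returned dict comes from CustomDownloader.registry, a class not shown in this excerpt
print(len(results), "entries recorded by CustomDownloader")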
Example no. 27
def main(w):
    os.remove('static/tmp/000001.jpg')
    
    print('run')
    crawler = BingImageCrawler(storage={"root_dir": "static/tmp"})
    crawler.crawl(keyword=w, max_num=1)
Example no. 28
import os
from icrawler.builtin import BingImageCrawler

path = r'/home/wwgz-cbm/spider_img/testBing'
f = open('starName.txt', 'r', encoding='utf-8')
lines = f.readlines()
for i, line in enumerate(lines):
    name = line.strip('\n')
    file_path = os.path.join(path, name)
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    bing_storage = {'root_dir': file_path}
    bing_crawler = BingImageCrawler(parser_threads=2,
                                    downloader_threads=4,
                                    storage=bing_storage)
    bing_crawler.crawl(keyword=name, max_num=10)
    print('Celebrity #{}: {}'.format(i, name))
Example no. 29
from icrawler.builtin import BaiduImageCrawler, BingImageCrawler, GoogleImageCrawler
import json
import os

keywords_file = '../keywords/keywords_20170906.json'
with open(keywords_file, 'r') as f:
    lines = json.load(f)
    for key, values in lines.items():
        for keyword in values:
            bing_crawler = BingImageCrawler(
                downloader_threads=4, storage={'root_dir': '../images/' + key})
            bing_crawler.crawl(keyword=keyword,
                               offset=0,
                               max_num=1000,
                               min_size=None,
                               max_size=None)

#baidu_crawler = BaiduImageCrawler(storage={'root_dir': './images'})
#baidu_crawler.crawl(keyword='sunny', offset=0, max_num=1000,
#                   min_size=None, max_size=None)
Example no. 30
from icrawler.builtin import BingImageCrawler
import os, sys

if len(sys.argv) != 3:
    print("{0} <folder name> <search term>".format(sys.argv[0]))
    sys.exit(1)

base_dir = os.path.abspath(str('images/' + sys.argv[1]))

print("Crawling...")
bing_crawler=BingImageCrawler(storage={'root_dir':str('images/' + sys.argv[1])})
bing_crawler.crawl(keyword=sys.argv[2], filters=None ,max_num=1000, offset=0)

a = 0

print("Renaming...")
for root, dirs, files in os.walk(base_dir):
    for file in files:
        if file.endswith("png") or file.endswith("jpg"):
            a += 1
            path = os.path.join(root, file)
            ext = os.path.splitext(file)[1]
            newpath = os.path.dirname(path) + "/" + str(a) + ext
            os.rename(path, newpath)
Example no. 31
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 17 10:01:38 2020

@author: hajime.b
"""

from icrawler.builtin import BingImageCrawler
crawler = BingImageCrawler(
    storage={"root_dir": "/Users/hajime.b/Documents/animals/dogs"})
crawler.crawl(keyword="犬", max_num=100)

crawler = BingImageCrawler(
    storage={"root_dir": "/Users/hajime.b/Documents/animals/cats"})
crawler.crawl(keyword="猫", max_num=100)

crawler = BingImageCrawler(
    storage={"root_dir": "/Users/hajime.b/Documents/animals/gorillas"})
crawler.crawl(keyword="ゴリラ", max_num=100)

crawler = BingImageCrawler(
    storage={"root_dir": "/Users/hajime.b/Documents/animals/Giraffes"})
crawler.crawl(keyword="キリン", max_num=100)

crawler = BingImageCrawler(
    storage={"root_dir": "/Users/hajime.b/Documents/animals/Lions"})
crawler.crawl(keyword="ライオン", max_num=100)
Example no. 32
from icrawler.builtin import BaiduImageCrawler, BingImageCrawler, GoogleImageCrawler
import os

save_dir = 'car_picture'

if not os.path.isdir('./downloads/%s' % save_dir):
    os.mkdir('./downloads/%s' % save_dir)

for keyword in ['car', '차']:
    filters = dict(size='>320x320', type='photo')

    bing_crawler = BingImageCrawler(
        downloader_threads=4,
        storage={'root_dir': './downloads/%s' % save_dir})
    bing_crawler.crawl(keyword=keyword,
                       filters=filters,
                       offset=0,
                       max_num=3000,
                       file_idx_offset='auto')
Example no. 33
import dbutility
import pickle
import os

while 1:
    c, conn = dbutility.create_connection()
    c.execute('SELECT * FROM scrapper WHERE status=? ', ('created', ))
    data = c.fetchall()
    # print(data)
    for task in data:
        dbutility.update_status_scrapper(task[0], 'started')
        classes = pickle.loads(task[3])

        for c in classes:
            bing_crawler = BingImageCrawler(
                downloader_threads=6,
                storage={'root_dir': f'static/datasets/{task[1]}/{c}'})
            bing_crawler.crawl(keyword=c,
                               filters=None,
                               offset=0,
                               max_num=int(task[2]))
        num_images = 0
        num_classes = 0
        dataset_path = f'static/datasets/{task[1]}'
        for clx in os.listdir(dataset_path):
            num_classes += 1
            num_images += len(os.listdir(os.path.join(dataset_path, clx)))
        dbutility.update_status_scrapper(task[1], 'completed')
        dbutility.insert_new_dataset(task[1], num_classes, num_images)
    conn.close()