import logging

from icrawler.builtin import BingImageCrawler


def test_bing():
    print('start testing BingImageCrawler')
    bing_crawler = BingImageCrawler(
        downloader_threads=2,
        storage={'root_dir': 'images/bing'},
        log_level=logging.INFO)
    search_filters = dict(
        type='photo',
        license='commercial',
        layout='wide',
        size='large',
        date='pastmonth')
    bing_crawler.crawl('cat', max_num=10, filters=search_filters)
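# For reference, the filter keys used above (type, license, layout, size, date)
# are the ones icrawler's BingImageCrawler accepts; this is a minimal sketch
# with a different, purely illustrative combination of values (the root_dir
# and keyword here are assumptions, not from the source).
from icrawler.builtin import BingImageCrawler

crawler = BingImageCrawler(storage={'root_dir': 'images/bing_filtered'})
crawler.crawl('cat', max_num=10,
              filters=dict(type='clipart', license='publicdomain',
                           size='medium', date='pastweek'))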
import logging
import os.path as osp
import shutil

from icrawler.builtin import BingImageCrawler


def test_bing():
    # test_dir is assumed to be defined at module level in the test suite
    img_dir = osp.join(test_dir, 'bing')
    bing_crawler = BingImageCrawler(
        downloader_threads=2,
        storage={'root_dir': img_dir},
        log_level=logging.INFO)
    search_filters = dict(
        type='photo',
        license='commercial',
        layout='wide',
        size='large',
        date='pastmonth')
    bing_crawler.crawl('cat', max_num=5, filters=search_filters)
    shutil.rmtree(img_dir)
import glob
import os
import shutil
from typing import Dict, List

from icrawler.builtin import BingImageCrawler


def crawl_keywords_save_folder(cls,
                               name: str,
                               keywords: List[str],
                               base_dir: str = "tmp",
                               filters: Dict = {"size": "large"},
                               max_num: int = 10000,
                               train_ratio: float = 0.8):
    """Crawl images for each keyword and build a train/test dataset.

    Arguments:
        name {str} -- dataset name
        keywords {List[str]} -- search keywords

    Keyword Arguments:
        base_dir {str} -- base output directory (default: {"tmp"})
        filters {Dict} -- Bing search filters (default: {{"size": "large"}})
        max_num {int} -- maximum number of images per keyword (default: {10000})
        train_ratio {float} -- fraction of images kept in the train split
            (default: {0.8})
    """
    download_base = os.path.join(base_dir, name)
    for k in keywords:
        download_dir = os.path.join(download_base, "train", k)
        if not os.path.exists(download_dir):
            os.makedirs(download_dir)
        storage = {"root_dir": download_dir}
        print("keyword:", k, " dir", download_dir)
        crawler = BingImageCrawler(storage=storage)
        crawler.crawl(keyword=k, filters=filters, max_num=max_num,
                      file_idx_offset=0)
        # Move everything beyond the train split into the test folder
        move_dir = os.path.join(download_base, "test", k)
        if not os.path.exists(move_dir):
            os.makedirs(move_dir)
        file_list = glob.glob(os.path.join(download_dir, "*.jpg"))
        move_num: int = int(len(file_list) * train_ratio)
        move_list = file_list[move_num:]
        for f in move_list:
            shutil.move(f, move_dir)
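# A minimal usage sketch for crawl_keywords_save_folder above. The function is
# written as a classmethod, so it would be called on its owning class; the
# class name (ImageDataset), dataset name, and keywords here are illustrative
# assumptions, not from the source.
ImageDataset.crawl_keywords_save_folder(
    name="pets",
    keywords=["cat", "dog"],
    base_dir="tmp",
    max_num=100,
    train_ratio=0.8,
)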
import logging
import os
from typing import Dict, List

from icrawler.builtin import BaiduImageCrawler, BingImageCrawler, GoogleImageCrawler


def crawl(folder: str, search: str, maxnum: int,
          crawlers: List[str] = ['GOOGLE', 'BING', 'BAIDU']) -> Dict[str, str]:
    """Crawl web sites for images"""
    print('(1) Crawling ...')
    # prepare folders
    os.makedirs(folder, exist_ok=True)
    sources = {}
    if maxnum > 1000:
        print("Max num limited to 1000")
        maxnum = 1000
    for c in crawlers:
        print(f' -> {c}')
        if c == 'GOOGLE':
            # CustomDownloader is a project-specific downloader defined elsewhere
            google_crawler = GoogleImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                feeder_threads=1,
                parser_threads=1,
                downloader_threads=4,
                storage={'root_dir': folder})
            google_crawler.crawl(keyword=search, offset=0, max_num=maxnum,
                                 min_size=(200, 200), max_size=None,
                                 file_idx_offset=0)
        if c == 'BING':
            bing_crawler = BingImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                downloader_threads=4,
                storage={'root_dir': folder})
            bing_crawler.crawl(keyword=search, filters=None, offset=0,
                               max_num=maxnum, file_idx_offset='auto')
        if c == 'BAIDU':
            baidu_crawler = BaiduImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                storage={'root_dir': folder})
            baidu_crawler.crawl(keyword=search, offset=0, max_num=maxnum,
                                min_size=(200, 200), max_size=None,
                                file_idx_offset='auto')
    return {k: v for k, v in CustomDownloader.registry.items() if k is not None}
import os

from icrawler.builtin import BaiduImageCrawler, BingImageCrawler, GoogleImageCrawler


def Query(query, verb, google=True, google_year=1, bing=True, baidu=True):
    # ROOT_DIR is assumed to be defined at module level
    SAVE_DIR = os.path.join(ROOT_DIR, verb)
    if not os.path.exists(SAVE_DIR):
        os.makedirs(SAVE_DIR)
    # SAVE_DIR = os.path.join(ROOT_DIR, query)
    if google:
        google_path = os.path.join(SAVE_DIR, 'Google')
        if not os.path.exists(google_path):
            os.makedirs(google_path)
        google_crawler = GoogleImageCrawler(feeder_threads=1,
                                            parser_threads=1,
                                            downloader_threads=4,
                                            storage={'root_dir': google_path})
        now_year = 2018
        for past_year in range(google_year):
            from_year = now_year - past_year
            filters = dict(
                license='noncommercial,modify',
                date=((from_year, 1, 1), (from_year, 12, 30)))
            google_crawler.crawl(keyword=query, filters=filters,
                                 max_num=1000, file_idx_offset='auto')
    if bing:
        bing_crawler = BingImageCrawler(
            downloader_threads=4,
            storage={'root_dir': os.path.join(SAVE_DIR, 'Bing')})
        filters_bing = dict(
            # size='large',
            # color='orange',
            license='noncommercial,modify')
        bing_crawler.crawl(keyword=query, filters=filters_bing,
                           offset=0, max_num=1000)
    if baidu:
        baidu_crawler = BaiduImageCrawler(
            storage={'root_dir': os.path.join(SAVE_DIR, 'Baidu')})
        baidu_crawler.crawl(keyword=query, offset=0, max_num=1000)
from icrawler.builtin import BaiduImageCrawler, BingImageCrawler, GoogleImageCrawler


def getImg(keywords='', dirpath='', amount=0, source=4):
    if source == 1:
        print('\n--- Starting download from Google Images ---\n')
        google_crawler = GoogleImageCrawler(parser_threads=2,
                                            downloader_threads=4,
                                            storage={'root_dir': dirpath})
        google_crawler.crawl(keyword=keywords, offset=0, max_num=amount,
                             date_min=None, date_max=None,
                             min_size=(200, 200), max_size=None)
    elif source == 2:
        print('\n--- Starting download from Microsoft Bing ---\n')
        bing_crawler = BingImageCrawler(downloader_threads=4,
                                        storage={'root_dir': dirpath})
        bing_crawler.crawl(keyword=keywords, offset=0, max_num=amount,
                           min_size=None, max_size=None)
    elif source == 3:
        print('\n--- Starting download from Baidu ---\n')
        baidu_crawler = BaiduImageCrawler(storage={'root_dir': dirpath})
        baidu_crawler.crawl(keyword=keywords, offset=0, max_num=amount,
                            min_size=None, max_size=None)
    else:
        # Default: download from all three engines in turn
        print('\n--- Starting download from Google Images ---\n')
        google_crawler = GoogleImageCrawler(parser_threads=2,
                                            downloader_threads=4,
                                            storage={'root_dir': dirpath})
        google_crawler.crawl(keyword=keywords, offset=0, max_num=amount,
                             date_min=None, date_max=None,
                             min_size=(200, 200), max_size=None)
        print('\n--- Starting download from Microsoft Bing ---\n')
        bing_crawler = BingImageCrawler(downloader_threads=4,
                                        storage={'root_dir': dirpath})
        bing_crawler.crawl(keyword=keywords, offset=0, max_num=amount,
                           min_size=None, max_size=None)
        print('\n--- Starting download from Baidu ---\n')
        baidu_crawler = BaiduImageCrawler(storage={'root_dir': dirpath})
        baidu_crawler.crawl(keyword=keywords, offset=0, max_num=amount,
                            min_size=None, max_size=None)
import logging

from icrawler.builtin import BingImageCrawler, GoogleImageCrawler


def crawel_auto(search_word, get_num):
    dir_name = "crawel"
    print("Started crawling Google.")
    # Google
    googleCrawler = GoogleImageCrawler(
        storage={"root_dir": f'{dir_name}/{search_word}'},
        log_level=logging.CRITICAL)
    googleCrawler.crawl(keyword=search_word, max_num=get_num)
    # print("Started crawling Baidu.")
    # Baidu
    # baiduCrawler = BaiduImageCrawler(
    #     storage={"root_dir": f'{dir_name}/{search_word}'},
    #     log_level=logging.CRITICAL)
    # baiduCrawler.crawl(keyword=search_word, max_num=get_num,
    #                    file_idx_offset=get_num)
    print("Started crawling Bing.")
    # Bing
    bingCrawler = BingImageCrawler(
        storage={"root_dir": f'{dir_name}/{search_word}'},
        log_level=logging.CRITICAL)
    bingCrawler.crawl(keyword=search_word, max_num=get_num,
                      file_idx_offset=get_num * 2)
import os

from icrawler.builtin import BingImageCrawler


def main(args):
    if args.output_dir is None:
        raise ValueError('output dir must be assigned')
    os.makedirs(args.output_dir, exist_ok=True)
    root_dir = os.path.join(args.output_dir, args.search_keyword)
    os.makedirs(root_dir, exist_ok=True)
    crawler = None
    if args.search_engine == 'bing':
        crawler = BingImageCrawler(feeder_threads=2,
                                   parser_threads=2,
                                   downloader_threads=10,
                                   storage={'root_dir': root_dir})
    else:
        # Without this, crawler stays None and the call below fails
        raise ValueError(f'unsupported search engine: {args.search_engine}')
    crawler.crawl(keyword=args.search_keyword, filters=None, offset=0,
                  max_num=args.number_of_image)
import logging

from icrawler.builtin import BaiduImageCrawler, BingImageCrawler, GoogleImageCrawler


def init_crawler(path, crawler=None, nthreads=4):
    assert crawler is not None, 'crawler is set as None.'
    if crawler in ['google']:
        m_crawler = GoogleImageCrawler(downloader_threads=nthreads,
                                       storage={'root_dir': path},
                                       log_level=logging.INFO)
    elif crawler in ['bing']:
        m_crawler = BingImageCrawler(storage={'root_dir': path},
                                     log_level=logging.INFO)
    elif crawler in ['baidu']:
        m_crawler = BaiduImageCrawler(downloader_threads=nthreads,
                                      storage={'root_dir': path})
    else:
        # Without this branch, m_crawler would be unbound for unknown engines
        raise ValueError(f'unknown crawler: {crawler}')
    return m_crawler
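# Hypothetical usage of init_crawler above; the path and keyword are
# illustrative assumptions, not from the source.
m_crawler = init_crawler('images/cats', crawler='bing', nthreads=4)
m_crawler.crawl(keyword='cat', max_num=50)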
import time

from icrawler.builtin import BingImageCrawler


def _image_scraping(self, keyword: str, max_num: int, storage: str) -> None:
    """Download images from Bing and save them locally.

    Args:
        keyword (str): search term for the images to download
        max_num (int): number of images to download
        storage (str): local download destination
    """
    crawler: BingImageCrawler = BingImageCrawler(storage={"root_dir": storage})
    crawler.crawl(keyword=keyword, max_num=max_num)
    # Sleep briefly between calls to reduce load on the server
    time.sleep(1)
import os

from icrawler.builtin import BingImageCrawler


def download_bing_images(
    entity: str,
    entity_type: str,
    download_folder: str,
    num_images: int,
    img_license: str,
    use_entity_type_query=False,
):
    # create output folder
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)
    # init crawler (CostumDownloader is a custom downloader defined elsewhere)
    crawler = BingImageCrawler(
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=4,
        downloader_cls=CostumDownloader,
        storage={"backend": "FileSystem", "root_dir": download_folder},
        extra_downloader_args={
            "entity_type": entity_type,
            "entity": entity,
            "root_dir": download_folder,
            "engine": "bing",
            "license": img_license,
        },
    )
    # specify search query
    if img_license == "noncommercial":
        filters = dict(type="photo", license="noncommercial")
    else:  # license == 'all'
        filters = dict(type="photo")
    if use_entity_type_query:
        keyword = entity + " " + entity_type
    else:
        keyword = entity
    # crawl images
    crawler.crawl(keyword=keyword, max_num=num_images, filters=filters)
    return crawler.downloader.entity_dict
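# An illustrative call to download_bing_images above; every argument value is
# an assumption for demonstration, and CostumDownloader must already be
# defined for the call to work.
entity_dict = download_bing_images(
    entity="golden retriever",
    entity_type="dog breed",
    download_folder="images/golden_retriever",
    num_images=50,
    img_license="noncommercial",
)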
from icrawler.builtin import BaiduImageCrawler, BingImageCrawler


def exe_crawl(arg):
    # google_crawler = GoogleImageCrawler(
    #     downloader_cls=PrefixNameGoogleDownloader,
    #     feeder_threads=1,
    #     parser_threads=1,
    #     downloader_threads=4,
    #     storage={'root_dir': f'{arg.dict}/{arg.keyword}/google'})
    filters = dict(license=f'{arg.license}')
    # google_crawler.crawl(keyword=f'{arg.keyword}', filters=filters,
    #                      offset=0, max_num=arg.max, file_idx_offset=0)
    # The PrefixName*Downloader classes are custom downloaders defined elsewhere
    bing_crawler = BingImageCrawler(
        downloader_cls=PrefixNameBingDownloader,
        downloader_threads=4,
        storage={'root_dir': f'{arg.dict}/{arg.keyword}/bing'})
    bing_crawler.crawl(keyword=f'{arg.keyword}', filters=filters,
                       offset=0, max_num=arg.max)
    baidu_crawler = BaiduImageCrawler(
        downloader_cls=PrefixNameBaiduDownloader,
        storage={'root_dir': f'{arg.dict}/{arg.keyword}/baidu'})
    baidu_crawler.crawl(keyword=f'{arg.keyword}', offset=0, max_num=arg.max)
from icrawler.builtin import BingImageCrawler, GoogleImageCrawler

# image_path is assumed to be defined earlier in the script
with open('SingleCloud.txt', 'r') as cloud_types_list:
    for cloudTypesName in cloud_types_list:
        cloud_type = cloudTypesName.strip('\n')
        # cloud_type = "single cloud in the sky"
        # imageDir = image_path + '\\' + cloud_type
        print("image path--------------" + image_path)
        # # flickr crawling
        # flickr_crawler = FlickrImageCrawler(
        #     Flickr_API_Key, parser_threads=2, downloader_threads=4,
        #     storage={'root_dir': image_path})
        # flickr_crawler.crawl(text=cloud_type, max_num=1000, tags=cloud_type)
        # google crawling
        google_crawler = GoogleImageCrawler(parser_threads=2,
                                            downloader_threads=4,
                                            storage={'root_dir': image_path})
        google_crawler.crawl(keyword=cloud_type, max_num=1000,
                             file_idx_offset='auto')
        # bing crawling
        bing_crawler = BingImageCrawler(parser_threads=2,
                                        downloader_threads=4,
                                        storage={'root_dir': image_path})
        bing_crawler.crawl(keyword=cloud_type, max_num=1000,
                           file_idx_offset='auto')

print("Image Collection is done")
from icrawler.builtin import BingImageCrawler


def BingCrawl_multi_thread(SEARCH_WORD: str, WORKING_DIRECTORY: str, SEARCH_QT: int):
    '''Fetch images from Bing image search (multi-threaded).'''
    # multiple threads
    crawler = BingImageCrawler(feeder_threads=4,
                               parser_threads=4,
                               downloader_threads=4,
                               storage={"root_dir": WORKING_DIRECTORY})
    # crawl once per license filter, appending to the same folder
    crawler.crawl(keyword=SEARCH_WORD, max_num=SEARCH_QT,
                  filters={'license': 'creativecommons'}, file_idx_offset=0)
    crawler.crawl(keyword=SEARCH_WORD, max_num=SEARCH_QT,
                  filters={'license': 'publicdomain'}, file_idx_offset='auto')
    crawler.crawl(keyword=SEARCH_WORD, max_num=SEARCH_QT,
                  filters={'license': 'noncommercial'}, file_idx_offset='auto')
    crawler.crawl(keyword=SEARCH_WORD, max_num=SEARCH_QT,
                  filters={'license': 'commercial'}, file_idx_offset='auto')
    crawler.crawl(keyword=SEARCH_WORD, max_num=SEARCH_QT,
                  filters={'license': 'noncommercial,modify'}, file_idx_offset='auto')
    crawler.crawl(keyword=SEARCH_WORD, max_num=SEARCH_QT,
                  filters={'license': 'commercial,modify'}, file_idx_offset='auto')
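# A minimal sketch invoking BingCrawl_multi_thread above; the search word,
# directory, and per-license count are illustrative assumptions.
BingCrawl_multi_thread(SEARCH_WORD="mount fuji",
                       WORKING_DIRECTORY="images/mount_fuji",
                       SEARCH_QT=20)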
from icrawler.builtin import BingImageCrawler

models = ["Tesla Model 3", "Tesla Model S", "Tesla Model X", "Tesla Model Y"]

for model in models:
    storage = {"backend": "FileSystem", "root_dir": "data/" + model}
    crawler = BingImageCrawler(storage=storage)
    crawler.crawl(keyword=model, max_num=1000)
# -*- coding: utf-8 -*-
"""bing_image_crawler.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1LbS3tAhjPKdVCPSNgFLh7yyNmwNrRHBs
"""

!pip install icrawler

from icrawler.builtin import BingImageCrawler

bing_crawler = BingImageCrawler(
    downloader_threads=4,
    storage={'root_dir': 'C:/bus'}  # set the save path as the value
)

# filter
bing_crawler.crawl(keyword='bus', filters=None, offset=0,
                   max_num=1000)  # keyword = search term, max_num = maximum number of files
import os
import shutil

from PIL import Image
from icrawler.builtin import BingImageCrawler


def switchgame_thumbnail_download_bing(switchgame_id: str, switchgame_title: str):
    # pf holds the platform name; Information, title_search_conv and
    # calc_image are helpers defined elsewhere in the project. The Darwin and
    # Linux branches were identical except for the platform string, so they
    # share one code path here.
    if pf in ('Darwin', 'Linux'):
        download_dir = Information.download_dir(pf) + str(switchgame_id)
        # Delete the previously downloaded folder on the server
        if os.path.exists(download_dir + "/"):
            shutil.rmtree(download_dir + "/")
        # Create the destination directory
        os.makedirs(str(switchgame_id), exist_ok=True)
        crawler = BingImageCrawler(storage={"root_dir": download_dir})
        print("1. Image Folder Remove Complete")
        # Normalise the Switch game title for image search
        switchgame_title = title_search_conv(switchgame_title)
        # Search for the title and download the images
        crawler.crawl(keyword=switchgame_title, max_num=7)
        print("2. File Server Upload Complete")
        # Resize 000001.jpg to a height of 461 px and 000002-000007.jpg to
        # 187 px, preserving aspect ratio via calc_image
        for idx in range(1, 8):
            name = "{:06d}.jpg".format(idx)
            print("Resizing " + name)
            height = 461 if idx == 1 else 187
            img = Image.open(download_dir + "/" + name)
            img_resize = img.convert('RGB').resize((calc_image(img, height), height))
            img_resize.save(download_dir + "/" + name)
        print("3. Image Resize Complete")
    image_url = list()
    image_url.append("image_uploaded")
    return image_url
import os
from urllib.request import urlopen

from icrawler.builtin import BingImageCrawler

# baseDir, targetName, image (the list of scraped <img> tags), context (an
# SSL context) and searchNum are assumed to be defined earlier in the script.
targetDir = baseDir + '/' + targetName
if not os.path.isdir(baseDir):
    os.mkdir(baseDir)
if not os.path.isdir(targetDir):
    os.mkdir(targetDir)

for idx, img in enumerate(image):
    print(idx)
    imgSrc = img['data-source']
    with urlopen(imgSrc, context=context) as f:
        with open(targetDir + '/' + targetName + '_' + str(idx) + '.jpg', 'wb') as h:
            data = f.read()
            h.write(data)
    if idx > 50:
        break
print('Naver Crawling done.')

# Bing Crawling
bing_crawler = BingImageCrawler(feeder_threads=10,
                                parser_threads=10,
                                downloader_threads=10,
                                storage={'root_dir': targetDir})
bing_crawler.session.verify = False
filters = dict(type='photo')  # only photo
bing_crawler.crawl(keyword=targetName, min_size=(200, 200), filters=filters,
                   max_num=searchNum, file_idx_offset='auto')
if engine in ('Google', 'google'):
    # MyImageDownloader is a custom downloader defined elsewhere
    google_crawler = GoogleImageCrawler(downloader_cls=MyImageDownloader,
                                        feeder_threads=1,
                                        parser_threads=1,
                                        downloader_threads=4,
                                        storage={'root_dir': 'matches'})
    # log_level=logging.INFO,
    # extra_downloader_args={'log_file': 'meta.txt'})
    google_crawler.crawl(keyword=query, max_num=num, file_idx_offset=0)
elif engine in ('Bing', 'bing'):
    bing_crawler = BingImageCrawler(downloader_cls=MyImageDownloader,
                                    feeder_threads=1,
                                    parser_threads=1,
                                    downloader_threads=4,
                                    storage={'root_dir': 'matches'})
    # log_level=logging.INFO,
    # extra_downloader_args={'log_file': 'meta.txt'})
    bing_crawler.crawl(keyword=query, filters=None, offset=0, max_num=num)
elif engine in ('Baidu', 'baidu'):
    baidu_crawler = BaiduImageCrawler(downloader_cls=MyImageDownloader,
                                      feeder_threads=1,
                                      parser_threads=1,
                                      downloader_threads=4,
                                      storage={'root_dir': 'matches'})
    # log_level=logging.INFO,
    # extra_downloader_args={'log_file': 'meta.txt'})
    baidu_crawler.crawl(keyword=query, offset=0, max_num=num)
from icrawler.builtin import BaiduImageCrawler, BingImageCrawler, GoogleImageCrawler

# keywords is assumed to be a list of names defined earlier
for keyword in keywords:
    save_path = 'D:/Korean Celeb Data/' + keyword
    google_crawler = GoogleImageCrawler(
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=4,
        storage={'root_dir': save_path + '/google'})
    filters = dict(type="face")
    google_crawler.crawl(keyword=keyword, filters=filters, offset=0,
                         max_num=1000, min_size=(200, 200), max_size=None,
                         file_idx_offset=0)
    bing_crawler = BingImageCrawler(downloader_threads=4,
                                    storage={'root_dir': save_path + '/bing'})
    bing_crawler.crawl(keyword=keyword, filters=None, offset=0, max_num=1000)
    baidu_crawler = BaiduImageCrawler(
        storage={'root_dir': save_path + '/baidu'})
    baidu_crawler.crawl(keyword=keyword, offset=0, max_num=1000,
                        min_size=(200, 200), max_size=None)
import os

from icrawler.builtin import BingImageCrawler, GoogleImageCrawler

# key_word and number are assumed to be defined earlier in the script
google_crawler = GoogleImageCrawler(parser_threads=2,
                                    downloader_threads=4,
                                    storage={'root_dir': key_word})
google_crawler.crawl(keyword=key_word, offset=0, max_num=number,
                     date_min=None, date_max=None,
                     min_size=(200, 200), max_size=None)
# prefix the Google results with 'google'
files = os.listdir(key_word)
for f in files:
    os.rename(os.path.join(key_word, f), os.path.join(key_word, 'google' + f))

bing_crawler = BingImageCrawler(downloader_threads=4,
                                storage={'root_dir': key_word})
bing_crawler.crawl(keyword=key_word, offset=0, max_num=number,
                   min_size=None, max_size=None)
# prefix the newly downloaded Bing results with 'bing'
files = os.listdir(key_word)
for f in files:
    if f[0] != 'g':
        os.rename(os.path.join(key_word, f), os.path.join(key_word, 'bing' + f))
from icrawler.builtin import BingImageCrawler

for keyword in ['greyhound']:
    bing_crawler = BingImageCrawler(
        parser_threads=2,
        downloader_threads=4,
        storage={'root_dir': 'images/{}'.format(keyword)})
    bing_crawler.crawl(keyword=keyword, max_num=1000, min_size=(200, 200))
from icrawler.builtin import BingImageCrawler crawler = BingImageCrawler(storage={"root_dir": "dogs"}) crawler.crawl(keyword="犬", max_num=10)
from icrawler.builtin import BingImageCrawler crawler = BingImageCrawler(storage={"root_dir": "mario_images"}) crawler.crawl(keyword="まりお流ラーメン", max_num=1000)
from icrawler import ImageDownloader
from icrawler.builtin import BingImageCrawler, GoogleImageCrawler


class PrefixNameDownloaderBing(ImageDownloader):
    prefix = 'bing'

    def get_filename(self, task, default_ext):
        filename = super(PrefixNameDownloaderBing, self).get_filename(
            task, default_ext)
        return self.prefix + '_' + filename


# getListNames and PrefixNameDownloaderGoogle are defined elsewhere
list_names = getListNames()
print(list_names)
for name in list_names:
    google_crawler = GoogleImageCrawler(
        downloader_cls=PrefixNameDownloaderGoogle,
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=4,
        storage={'root_dir': 'images/' + name})
    google_crawler.crawl(keyword=name, offset=0, max_num=1000,
                         min_size=(200, 200), max_size=None,
                         file_idx_offset=0)
    bing_crawler = BingImageCrawler(downloader_cls=PrefixNameDownloaderBing,
                                    downloader_threads=4,
                                    storage={'root_dir': 'images/' + name})
    bing_crawler.crawl(keyword=name, filters=None, offset=0, max_num=1000)
import logging
import os
from typing import Dict, List

from icrawler.builtin import (
    BaiduImageCrawler,
    BingImageCrawler,
    FlickrImageCrawler,
    GoogleImageCrawler,
)


def crawl(
    folder: str,
    search: str,
    maxnum: int,
    crawlers: List[str] = ["GOOGLE", "BING", "BAIDU", "FLICKR"],
) -> Dict[str, str]:
    """Crawl web sites for images"""
    print("(1) Crawling ...")
    # prepare folders
    os.makedirs(folder, exist_ok=True)

    if maxnum > 1000:
        print("Max num limited to 1000")
        maxnum = 1000

    for c in crawlers:
        print(f" -> {c}")
        if c == "GOOGLE":
            # CustomDownloader and GoogleParser are defined elsewhere
            google_crawler = GoogleImageCrawler(
                downloader_cls=CustomDownloader,
                parser_cls=GoogleParser,
                log_level=logging.CRITICAL,
                feeder_threads=1,
                parser_threads=1,
                downloader_threads=4,
                storage={"root_dir": folder},
            )
            google_crawler.crawl(
                keyword=search,
                offset=0,
                max_num=maxnum,
                min_size=(200, 200),
                max_size=None,
                file_idx_offset=0,
            )
        if c == "BING":
            bing_crawler = BingImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                downloader_threads=4,
                storage={"root_dir": folder},
            )
            bing_crawler.crawl(
                keyword=search,
                filters=None,
                offset=0,
                max_num=maxnum,
                file_idx_offset="auto",
            )
        if c == "BAIDU":
            baidu_crawler = BaiduImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                storage={"root_dir": folder},
            )
            baidu_crawler.crawl(
                keyword=search,
                offset=0,
                max_num=maxnum,
                min_size=(200, 200),
                max_size=None,
                file_idx_offset="auto",
            )
        if c == "FLICKR":
            flick_api_key = os.environ.get("FLICKR_API_KEY")
            if not flick_api_key:
                print(
                    "Error: Flickr crawler requires FLICKR_API_KEY environment variable"
                    " to be set with your non-secret API key.")
                exit(-1)
            flickr_crawler = FlickrImageCrawler(
                flick_api_key,
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                storage={"root_dir": folder},
            )
            flickr_crawler.crawl(
                text=search,
                offset=0,
                max_num=maxnum,
                min_size=(200, 200),
                max_size=None,
                file_idx_offset="auto",
            )

    return {
        k: v
        for k, v in CustomDownloader.registry.items() if k is not None
    }
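# Illustrative usage of the crawl() variant above; the folder and search term
# are assumptions. The FLICKR engine needs FLICKR_API_KEY set, as checked in
# the code, so it is omitted here.
registry = crawl(folder="images/lighthouse", search="lighthouse",
                 maxnum=100, crawlers=["GOOGLE", "BING"])
print(f"{len(registry)} images downloaded")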
import os

from icrawler.builtin import BingImageCrawler


def main(w):
    # Remove the previous result (if any) before fetching a new one
    if os.path.exists('static/tmp/000001.jpg'):
        os.remove('static/tmp/000001.jpg')
    print('run')
    crawler = BingImageCrawler(storage={"root_dir": "static/tmp"})
    crawler.crawl(keyword=w, max_num=1)
import os

from icrawler.builtin import BingImageCrawler

path = r'/home/wwgz-cbm/spider_img/testBing'
with open('starName.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

for i, line in enumerate(lines):
    name = line.strip('\n')
    file_path = os.path.join(path, name)
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    bing_storage = {'root_dir': file_path}
    bing_crawler = BingImageCrawler(parser_threads=2,
                                    downloader_threads=4,
                                    storage=bing_storage)
    bing_crawler.crawl(keyword=name, max_num=10)
    print('Celebrity #{}: {}'.format(i, name))
import json
import os

from icrawler.builtin import BaiduImageCrawler, BingImageCrawler, GoogleImageCrawler

keywords_file = '../keywords/keywords_20170906.json'
with open(keywords_file, 'r') as f:
    lines = json.load(f)

for key, values in lines.items():
    for keyword in values:
        bing_crawler = BingImageCrawler(
            downloader_threads=4,
            storage={'root_dir': '../images/' + key})
        bing_crawler.crawl(keyword=keyword, offset=0, max_num=1000,
                           min_size=None, max_size=None)
        # baidu_crawler = BaiduImageCrawler(storage={'root_dir': './images'})
        # baidu_crawler.crawl(keyword='sunny', offset=0, max_num=1000,
        #                     min_size=None, max_size=None)
from icrawler.builtin import BingImageCrawler
import os, sys

if len(sys.argv) != 3:
    print("{0} <folder name> <search term>".format(sys.argv[0]))
    sys.exit(1)

base_dir = os.path.abspath(str('images/' + sys.argv[1]))

print("Crawling...")
bing_crawler = BingImageCrawler(storage={'root_dir': str('images/' + sys.argv[1])})
bing_crawler.crawl(keyword=sys.argv[2], filters=None, max_num=1000, offset=0)

a = 0
print("Renaming...")
# rename downloaded images to sequential numbers, keeping their extensions
for root, dirs, files in os.walk(base_dir):
    for file in files:
        if file.endswith("png") or file.endswith("jpg"):
            a += 1
            path = os.path.join(root, file)
            ext = os.path.splitext(file)[1]
            newpath = os.path.dirname(path) + "/" + str(a) + ext
            os.rename(path, newpath)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 17 10:01:38 2020

@author: hajime.b
"""

from icrawler.builtin import BingImageCrawler

crawler = BingImageCrawler(
    storage={"root_dir": "/Users/hajime.b/Documents/animals/dogs"})
crawler.crawl(keyword="犬", max_num=100)

crawler = BingImageCrawler(
    storage={"root_dir": "/Users/hajime.b/Documents/animals/cats"})
crawler.crawl(keyword="猫", max_num=100)

crawler = BingImageCrawler(
    storage={"root_dir": "/Users/hajime.b/Documents/animals/gorillas"})
crawler.crawl(keyword="ゴリラ", max_num=100)

crawler = BingImageCrawler(
    storage={"root_dir": "/Users/hajime.b/Documents/animals/Giraffes"})
crawler.crawl(keyword="キリン", max_num=100)

crawler = BingImageCrawler(
    storage={"root_dir": "/Users/hajime.b/Documents/animals/Lions"})
crawler.crawl(keyword="ライオン", max_num=100)
import os

from icrawler.builtin import BaiduImageCrawler, BingImageCrawler, GoogleImageCrawler

save_dir = 'car_picture'
if not os.path.isdir('./downloads/%s' % save_dir):
    os.mkdir('./downloads/%s' % save_dir)

for keyword in ['car', '차']:  # '차' is Korean for "car"
    filters = dict(size='>320x320', type='photo')
    bing_crawler = BingImageCrawler(
        downloader_threads=4,
        storage={'root_dir': './downloads/%s' % save_dir})
    bing_crawler.crawl(keyword=keyword, filters=filters, offset=0,
                       max_num=3000, file_idx_offset='auto')
import os
import pickle

import dbutility
from icrawler.builtin import BingImageCrawler

while True:
    c, conn = dbutility.create_connection()
    c.execute('SELECT * FROM scrapper WHERE status=? ', ('created', ))
    data = c.fetchall()
    # print(data)
    for task in data:
        dbutility.update_status_scrapper(task[0], 'started')
        classes = pickle.loads(task[3])
        # crawl each class into its own folder (the loop variable is renamed
        # from `c` so it no longer shadows the database cursor)
        for class_name in classes:
            bing_crawler = BingImageCrawler(
                downloader_threads=6,
                storage={'root_dir': f'static/datasets/{task[1]}/{class_name}'})
            bing_crawler.crawl(keyword=class_name, filters=None, offset=0,
                               max_num=int(task[2]))
        num_images = 0
        num_classes = 0
        dataset_path = f'static/datasets/{task[1]}'
        for clx in os.listdir(dataset_path):
            num_classes += 1
            num_images += len(os.listdir(os.path.join(dataset_path, clx)))
        dbutility.update_status_scrapper(task[1], 'completed')
        dbutility.insert_new_dataset(task[1], num_classes, num_images)
    conn.close()