def claw(self, keyword, size=10):
    start = time.time()
    google_crawler = GoogleImageCrawler(
        storage={'root_dir': '/home/zluo/food/' + keyword})
    google_crawler.crawl(keyword=keyword + ' dishes', max_num=size)
    end = time.time()
    print(end - start)
def get_image(name, file_path, data_count, sample_filter=None):
    crawler = GoogleImageCrawler(
        storage={"root_dir": file_path + "/train" + "/" + name})
    # Default filters (the original defined these but never passed them on)
    if sample_filter is None:
        sample_filter = dict(size="large", type="photo")
    # Run the crawl
    crawler.crawl(keyword=name, filters=sample_filter, max_num=data_count)
    # Create the val directory
    if os.path.isdir(file_path + "/val" + "/" + name):
        shutil.rmtree(file_path + "/val" + "/" + name)
    os.makedirs(file_path + "/val" + "/" + name)
    # Build a list of the downloaded files
    filelist = glob.glob(file_path + "/train" + "/" + name + "/*")
    # Move 20% of the training data to val
    ratio = 0.2
    val_files = random.sample(filelist, int(len(filelist) * ratio))
    for line in val_files:
        shutil.move(line, file_path + "/val" + "/" + name)
def crawl_image(keyword, max_num):
    try:
        crawler = GoogleImageCrawler(storage={'root_dir': 'images'})
        crawler.crawl(keyword=keyword, max_num=int(max_num))
        var.set('Crawling is done.')  # `var` is a module-level tkinter StringVar
    except ValueError:
        var.set('Maximum number of images\nmust be an integer.')
def _download(query, dir, amount_to_crawl):
    """
    Download ``amount_to_crawl`` images from Google Image Search for query
    ``query`` and save them in directory ``dir``.

    :param query: Search query for Google Image Search
    :param dir: Directory to save the results in
    :param amount_to_crawl: Number of pictures to crawl
    """
    intlen = len(str(amount_to_crawl))
    google_crawler = GoogleImageCrawler(feeder_threads=10,
                                        parser_threads=1,
                                        log_level=100,
                                        downloader_threads=10,
                                        storage={'root_dir': dir})
    end_date = date.today()
    amount_crawled = 0
    # Chained comparison: amount_crawled < amount_to_crawl and amount_to_crawl > 0
    while amount_crawled < amount_to_crawl > 0:
        crawling = min(max_per_iteration, amount_to_crawl - amount_crawled)
        date_filter = _get_date_filter(end_date)
        log(f'{get_progress((amount_crawled + crawling) / amount_to_crawl)} '
            f'crawling images '
            f'{amount_crawled:0{intlen}d} - {(amount_crawled + crawling):0{intlen}d} / {amount_to_crawl} '
            f'for \'{query}\''
            f' in daterange {date_filter}',
            end='\r')
        google_crawler.crawl(keyword=query,
                             filters={'date': date_filter},
                             max_num=crawling,
                             file_idx_offset='auto')
        amount_crawled += crawling
        end_date += relativedelta(years=-1)
    print('')
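# _download() above relies on module-level helpers that are not shown in this
# snippet. A minimal sketch of what they might look like; the names come from
# the call sites, but the bodies are assumptions, not the original code:
from datetime import date
from dateutil.relativedelta import relativedelta

max_per_iteration = 100  # hypothetical cap on images per crawl() call

def _get_date_filter(end_date):
    # A one-year window ending at end_date, in the ((y, m, d), (y, m, d))
    # tuple form that icrawler's Google 'date' filter accepts
    start_date = end_date + relativedelta(years=-1)
    return ((start_date.year, start_date.month, start_date.day),
            (end_date.year, end_date.month, end_date.day))

def get_progress(fraction):
    # Render progress as a fixed-width percentage, e.g. '[ 42%]'
    return '[{:4.0%}]'.format(fraction)

def log(*args, **kwargs):
    # Thin wrapper around print so the progress line can pass end='\r'
    print(*args, **kwargs)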
def image_downloader(keywords=[], dir_path="dataset", maximages=100):
    for i, item in enumerate(keywords):
        dir_path_keyword = os.path.join(dir_path, item)
        # Create the directory if it does not exist
        if not os.path.exists(dir_path_keyword):
            os.makedirs(dir_path_keyword)
        print('\n\tWriting on directory: ' + str(dir_path_keyword))
        print('\tThe images for keyword: ' + str(item) + '\n')
        google_crawler = GoogleImageCrawler(
            feeder_threads=1,
            parser_threads=2,
            downloader_threads=4,
            storage={'root_dir': dir_path_keyword})
        filters = dict(
            type='photo'
            # size='large',
            # color='orange',
            # license='commercial,modify',
            # date=((2017, 1, 1), (2017, 11, 30))
        )
        google_crawler.crawl(keyword=item,
                             filters=filters,
                             max_num=maximages,
                             file_idx_offset=0)
    return True
def photo(self, photoDir, WORDS):
    # Creates an instance of GoogleImageCrawler (icrawler > builtin >
    # google.py) and passes the MyImageDownloader class as the downloader
    # class instead of the standard library's ImageDownloader class
    google_crawler = GoogleImageCrawler(
        downloader_cls=MyImageDownloader,
        parser_threads=2,
        downloader_threads=4,
        # Stores the files where the user indicates in the script argument
        storage={'root_dir': photoDir})
    # This is the key statement: it prepends the phone's name_ to the
    # filename of the downloaded photo
    google_crawler.downloader.prefix_name = self.name_
    # Set session.verify = False to work around an exception from requests,
    # found here: https://github.com/hellock/icrawler/issues/40
    google_crawler.session.verify = False
    # Get a random word from the list of WORDS passed
    word = random.choice(WORDS)
    # Actual call to the crawl method to scrape Google Images
    google_crawler.crawl(keyword=word, max_num=1)
    # Print the location that was passed by the script to the function
    print(textwrap.dedent("""
        File has been downloaded to: {}""".format(photoDir)))
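# MyImageDownloader is referenced above but not defined in this snippet. A
# minimal sketch, assuming it follows the PrefixNameDownloader idiom used in
# my_crawl() below and honors the `prefix_name` attribute set in photo():
from icrawler import ImageDownloader

class MyImageDownloader(ImageDownloader):
    prefix_name = ''  # overridden externally via crawler.downloader.prefix_name

    def get_filename(self, task, default_ext):
        filename = super(MyImageDownloader, self).get_filename(task, default_ext)
        return self.prefix_name + filename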
def exe_crawl(arg):
    google_crawler = GoogleImageCrawler(
        downloader_cls=PrefixNameGoogleDownloader,
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=4,
        storage={'root_dir': f'{arg.dict}/{arg.keyword}/google'})
    filters = dict(license=f'{arg.license}')
    google_crawler.crawl(keyword=f'{arg.keyword}',
                         filters=filters,
                         offset=0,
                         max_num=arg.max,
                         file_idx_offset=0)

    bing_crawler = BingImageCrawler(
        downloader_cls=PrefixNameBingDownloader,
        downloader_threads=4,
        storage={'root_dir': f'{arg.dict}/{arg.keyword}/bing'})
    bing_crawler.crawl(keyword=f'{arg.keyword}',
                       filters=filters,
                       offset=0,
                       max_num=arg.max)

    baidu_crawler = BaiduImageCrawler(
        downloader_cls=PrefixNameBaiduDownloader,
        storage={'root_dir': f'{arg.dict}/{arg.keyword}/baidu'})
    baidu_crawler.crawl(keyword=f'{arg.keyword}',
                        offset=0,
                        max_num=arg.max)
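# The three PrefixName*Downloader classes used by exe_crawl() are not shown.
# A plausible sketch of one of them, assuming each simply prepends its engine
# name to the saved filename (the Bing and Baidu variants would mirror this):
from icrawler import ImageDownloader

class PrefixNameGoogleDownloader(ImageDownloader):
    def get_filename(self, task, default_ext):
        filename = super(PrefixNameGoogleDownloader,
                         self).get_filename(task, default_ext)
        return 'google_' + filename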
def main():
    parser = argparse.ArgumentParser(
        description='This script downloads images with GoogleImageCrawler.')
    parser.add_argument(
        '-k', '--keyword',
        type=str,
        nargs='+',
        help='The keywords of images that users want to download.')
    parser.add_argument(
        '-d', '--dir',
        type=str,
        help='The dir that is used to save images. Default: ./image.',
        default='./image')
    parser.add_argument(
        '-n', '--num',
        type=int,
        help='The maximum number of each keyword\'s images to be downloaded. '
             'It should preferably be a multiple of 60. Default: 60.',
        default=60)
    args = parser.parse_args()

    max_num = args.num
    save_dir = args.dir
    keywords = args.keyword
    for keyword in keywords:
        print('Start to download the images of keyword: ' + keyword)
        google_storage = {'root_dir': save_dir + '/' + keyword}
        google_crawler = GoogleImageCrawler(parser_threads=4,
                                            downloader_threads=4,
                                            storage=google_storage)
        google_crawler.crawl(keyword=keyword, max_num=max_num)
def getPoliticianImage():
    # image path: first argument
    image_path = sys.argv[1]
    # excel path: second argument
    excel_path = sys.argv[2]
    # max number of images: third argument
    max_num_image = int(sys.argv[3])

    excelFile = xlrd.open_workbook(excel_path)
    politician_key_value_list = excelFile.sheet_by_index(0)
    for i in range(0, politician_key_value_list.nrows):
        # folder name (politician's English name)
        folder_name = politician_key_value_list.cell_value(i, 1)
        # directory to store this politician's images in
        total_path = image_path + '/' + folder_name
        google_crawler = GoogleImageCrawler(parser_threads=2,
                                            downloader_threads=8,
                                            storage={'root_dir': total_path})
        google_crawler.crawl(keyword=politician_key_value_list.cell_value(i, 0),
                             offset=0,
                             max_num=max_num_image,
                             date_min=None,
                             date_max=None,
                             min_size=(200, 200),
                             max_size=None)
def imagecrawl(searchwords, imagenum, title):
    """
    searchwords: search query, str.
    imagenum: number of images, int.
    title: work title; names the directory under train/validation, str.
    """
    # Check and create the directories
    train_dir = './train/' + title + '/'  # directory under train
    dircheck(train_dir)
    valid_dir = './validation/' + title + '/'  # directory under validation
    dircheck(valid_dir)
    max_idx = max_file_idx(train_dir, valid_dir)

    # Download the images with the crawler
    crawler = GoogleImageCrawler(storage={"root_dir": "tmp"})
    crawler.crawl(keyword=searchwords, max_num=imagenum, file_idx_offset=max_idx)

    # Split the downloaded files into train and validation and move them
    image_list = glob.glob('./tmp/*')
    random.shuffle(image_list)
    train_list, valid_list = np.split(np.array(image_list),
                                      [int(len(image_list) * 0.8)])
    train_list = list(train_list)
    valid_list = list(valid_list)
    for i in train_list:
        shutil.move(i, train_dir)
    for i in valid_list:
        shutil.move(i, valid_dir)
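# dircheck() and max_file_idx() are helpers imagecrawl() depends on but does
# not define. A sketch with behavior inferred from the call sites; both
# bodies are assumptions, not the original implementation:
import os
import glob

def dircheck(path):
    # Create the directory if it does not already exist
    os.makedirs(path, exist_ok=True)

def max_file_idx(*dirs):
    # Largest numeric filename stem across the given directories, so a new
    # crawl can continue the numbering via file_idx_offset
    idx = 0
    for d in dirs:
        for f in glob.glob(os.path.join(d, '*')):
            stem = os.path.splitext(os.path.basename(f))[0]
            if stem.isdigit():
                idx = max(idx, int(stem))
    return idx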
def craw_image(key, i):
    save_dir = '/Users/fanyang/python/finalproject/recipeimage'  # change to your saving dir
    crawler = GoogleImageCrawler(feeder_threads=1,
                                 parser_threads=2,
                                 downloader_threads=4,
                                 storage={'root_dir': save_dir + '/' + str(i)})
    crawler.crawl(keyword=key, max_num=3)
def send_nudes(entities):
    """ Sends nudes """
    print('--- Running send_nudes')
    # Get the path of the current working directory
    path = os.getcwd()
    path = path + '\\img'
    # Choose a random keyword
    keywrd = ['jesus', 'jesus staring', 'jesus wallpaper']
    # Google image search for 3 images
    # (rand_choice is random.choice; Image is PIL.Image)
    google_crawler = GoogleImageCrawler(storage={'root_dir': path})
    google_crawler.crawl(keyword=rand_choice(keywrd), max_num=3)

    file_list = os.listdir(path=path)
    path_list = []
    # Choose a random image to display
    for file in file_list:
        file = path + '\\' + str(file)
        path_list.append(file)
    image = rand_choice(path_list)
    # Display the image
    img = Image.open(image)
    img.show()
    # Delete the downloaded images
    for file in path_list:
        os.remove(file)
    text_resp = 'SENDING NUDES'
    return text_resp
def pring():
    # new folder name
    new_path = folder_path.get() + "\\" + folder_name.get()
    if not os.path.exists(new_path):
        # make new folder
        os.mkdir(new_path)
        # print("create new folder")

        # main program
        crawler = GoogleImageCrawler(storage={"root_dir": new_path})
        if combo_1.get() != "None":
            filters = dict(size=combo_1.get())
        else:
            filters = None
        crawler.crawl(keyword=picture_name.get(),
                      filters=filters,
                      offset=0,
                      max_num=int(picture_num.get()))
        res = messagebox.askokcancel('finished!!!',
                                     'Reset input, but check folder?')
        folder_name.delete(0, "end")
        picture_name.delete(0, "end")
        picture_num.delete(0, "end")
        if res:
            tkinter.filedialog.askopenfilename(initialdir=new_path)
    else:
        messagebox.showinfo(
            'failed...',
            'A folder with the same name already exists\n' + new_path)
def crawl_item(keyword, rootdir, max_num=500, language='vi'):
    '''
    max_num applies to each crawl over a different date range, so the total
    number of crawled images is up to max_num * (len(date) - 1).
    '''
    global google_crawler
    storage = {'root_dir': rootdir}
    print('Starting to crawl {}'.format(keyword))
    # change the storage dir
    google_crawler = GoogleImageCrawler(
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=4,
        storage=storage)
    for i in range(len(date) - 1):
        try:
            google_crawler.crawl(
                keyword=keyword,
                filters={'date': (date[i], date[i + 1])},
                max_num=max_num,
                file_idx_offset='auto',
                language=language)
        except Exception as err:
            print(err)
        time.sleep(0.5)
    return
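# crawl_item() iterates over a module-level `date` list that is not shown.
# A hypothetical example: consecutive year boundaries as (y, m, d) tuples,
# the form that icrawler's Google 'date' filter accepts:
date = [(2015, 1, 1), (2016, 1, 1), (2017, 1, 1), (2018, 1, 1)]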
def download_images(keyword, directory, quantity):
    os.chdir(directory)
    if keyword not in os.listdir():
        os.mkdir(keyword)
    os.chdir(keyword)
    # Older icrawler API (pre-0.3): the storage dir is the first positional
    # argument, and thread counts are passed to crawl() itself
    google_crawler = GoogleImageCrawler(directory + '/' + keyword)
    google_crawler.crawl(
        keyword=keyword,
        offset=0,
        max_num=quantity,
        date_min=None,
        date_max=None,
        feeder_thr_num=1,
        parser_thr_num=1,
        downloader_thr_num=4,
        # min_size=(200, 200), max_size=None)
        min_size=None,
        max_size=None)
    # prepend the keyword to each filename
    command = 'perl-rename \'s/(.*)/%s_$1/\' *' % (keyword)
    os.system(command)
def my_crawl(name):
    '''
    Uses GoogleImageCrawler to crawl Google Images and download results for
    the given keyword.
    :param name:
    :return:
    '''
    class PrefixNameDownloader(ImageDownloader):
        def get_filename(self, task, default_ext):
            filename = super(PrefixNameDownloader,
                             self).get_filename(task, default_ext)
            return name + filename

    google_crawler = GoogleImageCrawler(
        feeder_threads=1,
        parser_threads=2,
        downloader_cls=PrefixNameDownloader,
        downloader_threads=4,
        storage={
            'root_dir': '/Volumes/USB STICK/image database/images/google3'
        })
    filters = dict(
        size='=512x512',
        license='commercial,modify',
        date=((2017, 1, 1), (2017, 11, 30)))
    google_crawler.crawl(keyword=name + ' filetype:jpg',
                         filters=filters,
                         max_num=500,
                         file_idx_offset=0)
def crawl(
        folder: str,
        search: str,
        maxnum: int,
        crawlers: List[str] = ['GOOGLE', 'BING', 'BAIDU']) -> Dict[str, str]:
    """Crawl web sites for images"""
    print('(1) Crawling ...')
    # prepare folders
    os.makedirs(folder, exist_ok=True)

    if maxnum > 1000:
        print("Max num limited to 1000")
        maxnum = 1000

    for c in crawlers:
        print(f' -> {c}')
        if c == 'GOOGLE':
            google_crawler = GoogleImageCrawler(
                downloader_cls=CustomDownloader,
                log_level=logging.CRITICAL,
                feeder_threads=1,
                parser_threads=1,
                downloader_threads=4,
                storage={'root_dir': folder})
            google_crawler.crawl(keyword=search,
                                 offset=0,
                                 max_num=maxnum,
                                 min_size=(200, 200),
                                 max_size=None,
                                 file_idx_offset=0)
        if c == 'BING':
            bing_crawler = BingImageCrawler(downloader_cls=CustomDownloader,
                                            log_level=logging.CRITICAL,
                                            downloader_threads=4,
                                            storage={'root_dir': folder})
            bing_crawler.crawl(keyword=search,
                               filters=None,
                               offset=0,
                               max_num=maxnum,
                               file_idx_offset='auto')
        if c == 'BAIDU':
            baidu_crawler = BaiduImageCrawler(downloader_cls=CustomDownloader,
                                              log_level=logging.CRITICAL,
                                              storage={'root_dir': folder})
            baidu_crawler.crawl(keyword=search,
                                offset=0,
                                max_num=maxnum,
                                min_size=(200, 200),
                                max_size=None,
                                file_idx_offset='auto')

    return {
        k: v
        for k, v in CustomDownloader.registry.items() if k is not None
    }
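# CustomDownloader and its `registry` are not defined in this snippet. A
# minimal sketch consistent with how crawl() reads the registry at the end
# (mapping saved filename -> source URL); the details are assumptions:
from icrawler import ImageDownloader

class CustomDownloader(ImageDownloader):
    registry = {}

    def get_filename(self, task, default_ext):
        filename = super(CustomDownloader, self).get_filename(task, default_ext)
        # task['file_url'] is the image's source URL in icrawler download tasks
        CustomDownloader.registry[filename] = task['file_url']
        return filename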
def test_google(logo):
    google_crawler = GoogleImageCrawler(
        downloader_cls=MyImageDownloader,
        downloader_threads=4,
        storage={'root_dir': os.path.join(root, logo, 'google')},
        log_level=logging.INFO,
        # `filename` is presumably consumed by a patched crawler or by
        # MyImageDownloader; stock icrawler crawlers take no such argument
        filename=os.path.join(root, logo, 'google.txt'))
    google_crawler.crawl(logo, max_num=args.maxnum)
def crawl_images(image_dir, concept_keyword, N=10):
    google_crawler = GoogleImageCrawler(
        storage={'root_dir': os.path.join(image_dir,
                                          concept_keyword + "_before")},
        feeder_threads=1,
        parser_threads=2,
        downloader_threads=4,
    )
    google_crawler.crawl(keyword=concept_keyword, max_num=N)
def test_google():
    google_crawler = GoogleImageCrawler(downloader_threads=4,
                                        storage={'root_dir': 'images/google'},
                                        log_level=logging.INFO)
    google_crawler.crawl('tesla',
                         max_num=10,
                         date_min=date(2016, 2, 1),
                         date_max=date(2016, 3, 15))
def CrawlByName(name, numPictures, savedir):
    # Crawls numPictures images from Google Images for the given search term
    print('Start Crawling...')
    capture = StringIO()
    sys.stderr = capture
    google_crawler = GoogleImageCrawler(parser_threads=2,
                                        downloader_threads=4,
                                        storage={'root_dir': str(savedir)})
    google_crawler.crawl(keyword=str(name),
                         max_num=numPictures,
                         date_min=None,
                         date_max=None)
    # Return the captured output with image links (needed later for
    # cross-validation)
    return capture.getvalue()
def getGoogleImage(keyword, dir, max):
    google_crawler = GoogleImageCrawler(parser_threads=2,
                                        downloader_threads=10,
                                        storage={'root_dir': dir})
    google_crawler.crawl(keyword=keyword,
                         offset=0,
                         max_num=max,
                         min_size=(100, 100),
                         max_size=None)
def download_image(fish_name, max_num):
    google_crawler = GoogleImageCrawler(
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=54,
        storage={'root_dir': 'fish_image/' + fish_name})
    google_crawler.crawl(keyword=fish_name,
                         filters=None,
                         offset=0,
                         max_num=max_num,
                         min_size=None,
                         max_size=None,
                         file_idx_offset=0)
def test_google():
    print('start testing GoogleImageCrawler')
    google_crawler = GoogleImageCrawler(
        downloader_threads=4,
        storage={'root_dir': 'images/google'},
        log_level=logging.INFO)
    search_filters = dict(
        size='large',
        color='orange',
        license='commercial,modify',
        date=(None, (2017, 11, 30)))
    google_crawler.crawl('cat', filters=search_filters, max_num=10)
def test_google():
    img_dir = osp.join(test_dir, 'google')
    google_crawler = GoogleImageCrawler(
        downloader_threads=2,
        storage={'root_dir': img_dir},
        log_level=logging.INFO)
    search_filters = dict(
        size='large',
        color='orange',
        license='commercial,modify',
        date=(None, (2017, 11, 30)))
    google_crawler.crawl('cat', filters=search_filters, max_num=5)
    shutil.rmtree(img_dir)
def getImg(keywords='', dirpath='', amount=0, source=4):
    if source == 1:
        print('\n--- Starting download from "Google Images" ---\n')
        google_crawler = GoogleImageCrawler(parser_threads=2,
                                            downloader_threads=4,
                                            storage={'root_dir': dirpath})
        google_crawler.crawl(keyword=keywords, offset=0, max_num=amount,
                             date_min=None, date_max=None,
                             min_size=(200, 200), max_size=None)
    elif source == 2:
        print('\n--- Starting download from "Microsoft Bing" ---\n')
        bing_crawler = BingImageCrawler(downloader_threads=4,
                                        storage={'root_dir': dirpath})
        bing_crawler.crawl(keyword=keywords, offset=0, max_num=amount,
                           min_size=None, max_size=None)
    elif source == 3:
        print('\n--- Starting download from "Baidu" ---\n')
        baidu_crawler = BaiduImageCrawler(storage={'root_dir': dirpath})
        baidu_crawler.crawl(keyword=keywords, offset=0, max_num=amount,
                            min_size=None, max_size=None)
    else:
        # Any other `source` value downloads from all three engines
        print('\n--- Starting download from "Google Images" ---\n')
        google_crawler = GoogleImageCrawler(parser_threads=2,
                                            downloader_threads=4,
                                            storage={'root_dir': dirpath})
        google_crawler.crawl(keyword=keywords, offset=0, max_num=amount,
                             date_min=None, date_max=None,
                             min_size=(200, 200), max_size=None)
        print('\n--- Starting download from "Microsoft Bing" ---\n')
        bing_crawler = BingImageCrawler(downloader_threads=4,
                                        storage={'root_dir': dirpath})
        bing_crawler.crawl(keyword=keywords, offset=0, max_num=amount,
                           min_size=None, max_size=None)
        print('\n--- Starting download from "Baidu" ---\n')
        baidu_crawler = BaiduImageCrawler(storage={'root_dir': dirpath})
        baidu_crawler.crawl(keyword=keywords, offset=0, max_num=amount,
                            min_size=None, max_size=None)
from icrawler.builtin import BaiduImageCrawler, BingImageCrawler, GoogleImageCrawler

google_crawler = GoogleImageCrawler(parser_threads=2,
                                    downloader_threads=4,
                                    storage={'root_dir': 'C:/Users/Saurabh/Desktop/'})
google_crawler.crawl(keyword='sandwich', offset=0, max_num=100,
                     date_min=None, date_max=None,
                     min_size=(200, 200), max_size=None)