def search(self):
    """Download up to ``self.n_images`` photos matching ``self.data`` from Flickr.

    Pages through the ``flickr.photos.search`` REST endpoint starting at
    ``self.page`` and saves each photo into ``<root_folder>/<folder>`` via
    ``download``.

    Returns:
        0 when the query yields no results at all; otherwise ``None`` once
        ``self.n_images`` images have been downloaded.

    Raises:
        requests.HTTPError: if the Flickr API responds with an error status.
    """
    FLICKR_LINK = 'https://www.flickr.com/services/rest/'
    # Flickr expects '+'-separated tags; drop a leading '+' produced by a
    # query that starts with a space (guard against an empty query too).
    data = self.data.replace(" ", "+")
    if data and data[0] == "+":
        data = data[1:]
    params = {
        "method": "flickr.photos.search",
        "api_key": self.api_key,
        "tags": data,
        "format": "json",
        "page": self.page,
        "nojsoncallback": 1,
    }
    with Progress() as progress:
        task1 = progress.add_task(
            f"Downloading [blue]{self.data}[/blue] class...",
            total=self.n_images)
        while self.downloaded_images < self.n_images:
            response = requests.get(FLICKR_LINK, params=params)
            response.raise_for_status()
            results = response.json()['photos']
            # Flickr reports 'total' as a string; coerce before comparing,
            # otherwise the zero-results branch can never trigger.
            if int(results['total']) == 0:
                progress.update(task1, advance=self.n_images)
                return 0
            self.page += 1
            # BUG FIX: keep the request parameters in sync with the page
            # counter — the original never updated the dict, so every
            # iteration re-fetched the same page forever.
            params["page"] = self.page
            # exist_ok avoids the check-then-create race of exists()+mkdir().
            os.makedirs(os.path.join(self.root_folder, self.folder),
                        exist_ok=True)
            for num, result in enumerate(results['photo']):
                if self.downloaded_images >= self.n_images:
                    break
                link = (
                    f"https://farm{result['farm']}.staticflickr.com/"
                    f"{result['server']}/{result['id']}_{result['secret']}.jpg"
                )
                try:
                    download(link, num, self.size, self.root_folder,
                             self.folder)
                except Exception:
                    # Best effort: skip individual images that fail, but no
                    # longer swallow KeyboardInterrupt/SystemExit.
                    continue
                self.downloaded_images += 1
                progress.update(task1, advance=1)
def search(self):
    """Scrape Bing image search for up to ``self.n_images`` images.

    Fetches result pages from the public Bing async endpoint, extracts the
    original-media URLs (``murl``) from the HTML, downloads each image into
    ``<root_folder>/<folder>``, then subtracts duplicates found by
    ``erase_duplicates`` from the downloaded count.
    """
    BING_IMAGE = 'https://www.bing.com/images/async?q='
    # Without a User-Agent header Bing denies the request.
    USER_AGENT = {
        'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0'
    }
    # Bing query terms are '-'-separated; drop a leading '-' produced by a
    # query that starts with a space (guard against an empty query too).
    data = self.data.replace(" ", "-")
    if data and data[0] == "-":
        data = data[1:]
    # BUG FIX: define target_folder before the loop — the original only
    # assigned it inside the while body, so erase_duplicates() raised
    # NameError whenever the loop body never ran.
    target_folder = os.path.join(self.root_folder, self.folder)
    with Progress() as progress:
        task1 = progress.add_task(
            f"Downloading [blue]{self.data}[/blue] class...",
            total=self.n_images)
        while self.downloaded_images < self.n_images:
            searchurl = (BING_IMAGE + data + '&first=' + str(self.page)
                         + '&count=100')
            response = requests.get(searchurl, headers=USER_AGENT)
            html = response.text
            self.page += 100
            results = re.findall('murl":"(.*?)"', html)
            # BUG FIX: stop once Bing returns no more image URLs — the
            # original looped (and hit the network) forever in that case.
            if not results:
                break
            # exist_ok avoids the check-then-create race of exists()+mkdir().
            os.makedirs(target_folder, exist_ok=True)
            for link in results:
                if self.downloaded_images >= self.n_images:
                    break
                try:
                    download(link, self.size, self.root_folder, self.folder,
                             self.resize_method)
                except Exception:
                    # Best effort: skip individual images that fail.
                    continue
                self.downloaded_images += 1
                progress.update(task1, advance=1)
        self.downloaded_images -= erase_duplicates(target_folder)
        print('Done')
def search(self):
    """Download up to ``self.n_images`` images via the Bing Image Search API v7.

    Pages through ``/bing/v7.0/images/search`` using ``count``/``offset``,
    downloads each result into ``<root_folder>/<folder>``, records per-image
    metadata in ``self.dataset_info``, removes duplicates, and finally writes
    the class info file via ``generate_class_info``.

    Raises:
        requests.HTTPError: if the Bing API responds with an error status.
    """
    BING_IMAGE = 'https://api.cognitive.microsoft.com/bing/v7.0/images/search'
    headers = {"Ocp-Apim-Subscription-Key": self.api_key}
    params = {"q": self.data, "count": 100, "offset": self.page}
    # BUG FIX: define target_folder before the loop — the original only
    # assigned it inside the while body, so erase_duplicates() raised
    # NameError whenever the loop body never ran.
    target_folder = os.path.join(self.root_folder, self.folder)
    with Progress() as progress:
        task1 = progress.add_task(
            f"Downloading [blue]{self.data}[/blue] class...",
            total=self.n_images)
        while self.downloaded_images < self.n_images:
            response = requests.get(BING_IMAGE, headers=headers, params=params)
            response.raise_for_status()
            results = response.json()
            self.page += 100
            # BUG FIX: advance the request offset — the original never
            # updated params, so every call returned the same first page.
            params["offset"] = self.page
            # Stop once the API returns no more results (avoids an
            # infinite loop when the query is exhausted).
            if not results.get('value'):
                break
            # exist_ok avoids the check-then-create race of exists()+mkdir().
            os.makedirs(target_folder, exist_ok=True)
            for result in results['value']:
                if self.downloaded_images >= self.n_images:
                    break
                try:
                    download(result['contentUrl'], self.size,
                             self.root_folder, self.folder,
                             self.resize_method)
                    self.dataset_info.append({
                        'name': result['name'],
                        'origin': result['hostPageDisplayUrl'].split('/')[2],
                        'date': result['datePublished'],
                        'original_size': result['contentSize'],
                        'original_width': result['width'],
                        'original_height': result['height'],
                    })
                except Exception:
                    # Best effort: skip individual images that fail.
                    continue
                self.downloaded_images += 1
                progress.update(task1, advance=1)
        self.downloaded_images -= erase_duplicates(target_folder)
    generate_class_info(self.dataset_info, self.root_folder, self.folder)
def search(self):
    """Download up to ``self.n_images`` images from DuckDuckGo image search.

    First POSTs the query to obtain the ``vqd`` token required by the
    ``i.js`` image endpoint, then follows the ``next`` cursor page by page,
    downloading each result into ``<root_folder>/<folder>``.

    Returns:
        -1 if the ``vqd`` token cannot be extracted or the endpoint keeps
        returning unparseable responses; 0 when results are exhausted;
        otherwise ``None`` once ``self.n_images`` images are downloaded.
    """
    URL = 'https://duckduckgo.com/'
    PARAMS = {'q': self.data}
    HEADERS = {
        'authority': 'duckduckgo.com',
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'sec-fetch-dest': 'empty',
        'x-requested-with': 'XMLHttpRequest',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'referer': 'https://duckduckgo.com/',
        'accept-language': 'en-US,en;q=0.9',
    }
    res = requests.post(URL, data=PARAMS, timeout=3.000)
    # The vqd token is embedded in the HTML and is mandatory for i.js.
    search_object = re.search(r'vqd=([\d-]+)\&', res.text, re.M | re.I)
    if not search_object:
        return -1
    PARAMS = (('l', 'us-en'), ('o', 'json'), ('q', self.data),
              ('vqd', search_object.group(1)), ('f', ',,,'), ('p', '1'),
              ('v7exp', 'a'))
    request_url = URL + "i.js"
    with Progress() as progress:
        # Label made consistent with the other downloader classes.
        task1 = progress.add_task(
            f"Downloading [blue]{self.data}[/blue] class...",
            total=self.n_images)
        while self.downloaded_images < self.n_images:
            # BUG FIX: bound the retry loop — the original retried a
            # ValueError forever and could hang the whole run.
            for _attempt in range(10):
                try:
                    res = requests.get(request_url, headers=HEADERS,
                                       params=PARAMS, timeout=3.000)
                    data = json.loads(res.text)
                    break
                except ValueError:
                    # Endpoint sometimes returns non-JSON while rate
                    # limiting; back off and retry.
                    time.sleep(5)
            else:
                return -1
            # exist_ok avoids the check-then-create race of exists()+mkdir().
            os.makedirs(os.path.join(self.root_folder, self.folder),
                        exist_ok=True)
            # Only take as many results as are still needed.
            remaining = self.n_images - self.downloaded_images
            for num, result in enumerate(data["results"][:remaining]):
                try:
                    download(result["image"], num, self.size,
                             self.root_folder, self.folder)
                except Exception:
                    # Best effort: skip individual images that fail.
                    continue
                self.downloaded_images += 1
                progress.update(task1, advance=1)
            if "next" not in data:
                return 0
            request_url = URL + data["next"]