Beispiel #1
0
    def search(self):
        """Download up to ``self.n_images`` Flickr photos tagged ``self.data``.

        Queries the ``flickr.photos.search`` REST method page by page,
        starting at ``self.page``, and hands every photo URL to
        ``download`` until ``self.downloaded_images`` reaches
        ``self.n_images``. ``self.page`` and ``self.downloaded_images`` are
        updated in place.

        Returns:
            0 when the search has no results (or no further results);
            ``None`` once the requested number of images was downloaded.

        Raises:
            requests.HTTPError: if Flickr answers with an error status.
        """
        FLICKR_LINK = 'https://www.flickr.com/services/rest/'

        data = self.data.replace(" ", "+")
        # startswith() is safe on an empty query, unlike indexing data[0].
        if data.startswith("+"):
            data = data[1:]

        params = {
            "method": "flickr.photos.search",
            "api_key": self.api_key,
            "tags": data,
            "format": "json",
            "page": self.page,
            "nojsoncallback": 1
        }
        target_folder = os.path.join(self.root_folder, self.folder)

        with Progress() as progress:
            task1 = progress.add_task(
                f"Downloading [blue]{self.data}[/blue] class...",
                total=self.n_images)
            while self.downloaded_images < self.n_images:
                # The dict was built once above; without this refresh every
                # iteration would request the very same page again.
                params["page"] = self.page
                response = requests.get(FLICKR_LINK, params=params)
                response.raise_for_status()
                results = response.json()['photos']
                # int() tolerates the count being serialised as a string.
                if int(results['total']) == 0:
                    progress.update(task1, advance=self.n_images)
                    return 0

                photos = results.get('photo') or []
                if not photos:
                    # Ran past the last page: nothing left to fetch, so stop
                    # instead of looping forever.
                    return 0

                self.page += 1

                os.makedirs(target_folder, exist_ok=True)

                # NOTE(review): `num` restarts at 0 on every page; if
                # `download` derives the file name from it, later pages may
                # overwrite earlier files - confirm against `download`.
                for num, result in enumerate(photos):
                    if self.downloaded_images >= self.n_images:
                        break
                    try:
                        link = f"https://farm{result['farm']}.staticflickr.com/{result['server']}/{result['id']}_{result['secret']}.jpg"
                        download(link, num, self.size, self.root_folder,
                                 self.folder)
                    except Exception:
                        # Best effort: skip photos that fail to download, but
                        # let KeyboardInterrupt/SystemExit propagate.
                        continue
                    self.downloaded_images += 1
                    progress.update(task1, advance=1)
Beispiel #2
0
    def search(self):
        """Scrape Bing image search and download up to ``self.n_images`` images.

        Requests the asynchronous Bing image endpoint in pages of 100
        results starting at ``self.page``, extracts the ``murl`` image links
        from the returned HTML and passes each one to ``download``. After
        every page ``erase_duplicates`` is run on the target folder and the
        number it returns is subtracted from ``self.downloaded_images``, so
        the loop keeps going until enough images remain or Bing returns no
        more links. Prints ``'Done'`` when finished.
        """
        BING_IMAGE = 'https://www.bing.com/images/async?q='

        USER_AGENT = {
            'User-Agent':
            'Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0'
        }

        data = self.data.replace(" ", "-")
        # startswith() is safe on an empty query, unlike indexing data[0].
        if data.startswith("-"):
            data = data[1:]

        target_folder = os.path.join(self.root_folder, self.folder)

        with Progress() as progress:
            task1 = progress.add_task(
                f"Downloading [blue]{self.data}[/blue] class...",
                total=self.n_images)
            while self.downloaded_images < self.n_images:
                searchurl = BING_IMAGE + data + '&first=' + str(
                    self.page) + '&count=100'

                # request url, without usr_agent the permission gets denied
                response = requests.get(searchurl, headers=USER_AGENT)
                html = response.text
                self.page += 100
                results = re.findall('murl&quot;:&quot;(.*?)&quot;', html)
                if not results:
                    # No more links on this page: stop rather than request
                    # ever-higher offsets forever.
                    break

                os.makedirs(target_folder, exist_ok=True)

                for link in results:
                    if self.downloaded_images >= self.n_images:
                        break
                    try:
                        download(link, self.size, self.root_folder,
                                 self.folder, self.resize_method)
                    except Exception:
                        # Best effort: skip links that fail to download, but
                        # let KeyboardInterrupt/SystemExit propagate.
                        continue
                    self.downloaded_images += 1
                    progress.update(task1, advance=1)

                removed = erase_duplicates(target_folder)
                self.downloaded_images -= removed
                if removed:
                    # Bring the bar back in line with the corrected counter;
                    # otherwise it would overshoot the total on later pages.
                    progress.update(task1, completed=self.downloaded_images)
        print('Done')
Beispiel #3
0
	def search(self):
		"""Download up to ``self.n_images`` images via the Bing Image Search API.

		Pages through the v7.0 image search endpoint 100 results at a time,
		starting at offset ``self.page``. Each ``contentUrl`` is passed to
		``download`` and a metadata record for it is appended to
		``self.dataset_info``. After the loop, the number returned by
		``erase_duplicates`` is subtracted from ``self.downloaded_images``
		and ``generate_class_info`` is called with the collected metadata.

		Raises:
			requests.HTTPError: if the API answers with an error status.
		"""
		BING_IMAGE = 'https://api.cognitive.microsoft.com/bing/v7.0/images/search'

		headers = {"Ocp-Apim-Subscription-Key": self.api_key}
		params = {"q": self.data, "count": 100, "offset": self.page}

		# Resolved (and created) up front so it is always bound when
		# erase_duplicates() runs, even if the loop body never executes.
		target_folder = os.path.join(self.root_folder, self.folder)
		os.makedirs(target_folder, exist_ok=True)

		with Progress() as progress:
			task1 = progress.add_task(f"Downloading [blue]{self.data}[/blue] class...", total=self.n_images)
			while self.downloaded_images < self.n_images:
				# The dict was built once above; without this refresh every
				# request would fetch the same first batch of results.
				params["offset"] = self.page
				response = requests.get(BING_IMAGE, headers=headers, params=params)
				response.raise_for_status()
				results = response.json()
				self.page += 100

				images = results.get('value') or []
				if not images:
					# Result set exhausted: stop instead of looping forever.
					break

				for result in images:
					if self.downloaded_images >= self.n_images:
						break
					try:
						download(result['contentUrl'], self.size, self.root_folder, self.folder, self.resize_method)
						self.dataset_info.append({
							'name': result['name'],
							'origin': result['hostPageDisplayUrl'].split('/')[2],
							'date': result['datePublished'],
							'original_size': result['contentSize'],
							'original_width': result['width'],
							'original_height': result['height']})
					except Exception:
						# Best effort: skip results that fail to download or
						# lack metadata, but let KeyboardInterrupt propagate.
						continue
					self.downloaded_images += 1
					progress.update(task1, advance=1)
			# NOTE(review): duplicates are removed only after the loop, so the
			# final count may end up below n_images - confirm this is intended.
			self.downloaded_images -= erase_duplicates(target_folder)
		generate_class_info(self.dataset_info, self.root_folder, self.folder)
Beispiel #4
0
    def search(self):
        """Download up to ``self.n_images`` images from DuckDuckGo image search.

        First posts the query to the DuckDuckGo front page to extract the
        ``vqd`` token from the response, then pages through the ``i.js``
        JSON endpoint, following each response's ``next`` link. Every
        result's ``image`` URL is passed to ``download``;
        ``self.downloaded_images`` is incremented for each success.

        Returns:
            -1 if the ``vqd`` token cannot be found or the endpoint keeps
            returning invalid JSON; 0 when there is no further result page;
            ``None`` once the loop ends because enough images were
            downloaded.
        """
        URL = 'https://duckduckgo.com/'
        # Upper bound on consecutive invalid-JSON responses per page, so a
        # persistently failing endpoint cannot hang the caller forever.
        MAX_RETRIES = 5
        PARAMS = {'q': self.data}
        HEADERS = {
            'authority': 'duckduckgo.com',
            'accept': 'application/json, text/javascript, */*; q=0.01',
            'sec-fetch-dest': 'empty',
            'x-requested-with': 'XMLHttpRequest',
            'user-agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-mode': 'cors',
            'referer': 'https://duckduckgo.com/',
            'accept-language': 'en-US,en;q=0.9'
        }

        res = requests.post(URL, data=PARAMS, timeout=3.000)
        search_object = re.search(r'vqd=([\d-]+)\&', res.text, re.M | re.I)

        if not search_object:
            return -1

        PARAMS = (('l', 'us-en'), ('o', 'json'), ('q', self.data),
                  ('vqd', search_object.group(1)), ('f', ',,,'), ('p', '1'),
                  ('v7exp', 'a'))

        request_url = URL + "i.js"
        target_folder = os.path.join(self.root_folder, self.folder)

        with Progress() as progress:

            task1 = progress.add_task(
                "[blue]Downloading {x} class...".format(x=self.data),
                total=self.n_images)
            while self.downloaded_images < self.n_images:
                for _ in range(MAX_RETRIES):
                    try:
                        res = requests.get(request_url,
                                           headers=HEADERS,
                                           params=PARAMS,
                                           timeout=3.000)
                        data = json.loads(res.text)
                    except ValueError:
                        # Body was not valid JSON; wait, then try again.
                        time.sleep(5)
                        continue
                    break
                else:
                    # Every attempt returned invalid JSON: give up.
                    return -1

                os.makedirs(target_folder, exist_ok=True)

                # Keep only as many results as are still needed.
                remaining = self.n_images - self.downloaded_images
                page_results = (data.get("results") or [])[:remaining]

                # NOTE(review): `num` restarts at 0 on every page; if
                # `download` derives the file name from it, later pages may
                # overwrite earlier files - confirm against `download`.
                for num, results in enumerate(page_results):
                    try:
                        download(results["image"], num, self.size,
                                 self.root_folder, self.folder)
                    except Exception:
                        # Best effort: skip images that fail to download.
                        continue
                    self.downloaded_images += 1
                    progress.update(task1, advance=1)

                if "next" not in data:
                    return 0
                request_url = URL + data["next"]