def _fetch(self):
    """Resolve the direct image URL for a tweet photo.

    Downloads the tweet page (using a desktop IE 11 user-agent string,
    which makes Twitter serve a parseable legacy HTML page), then picks
    the photo selected by the ``photo_nr`` regex group (1-based in the
    URL; defaults to the first photo) and appends its
    ``data-image-url`` to ``self.src``.
    """
    # NOTE: removed a large slab of commented-out dead code that parsed
    # the old AdaptiveMedia-*Photo markup; the selector below is the
    # current approach.  TODO Check if format was changed.
    response = make_request(
        self.url,
        "GET",
        useragent="Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; rv:11.0)"
    )
    with response as file:
        text = decode_all(file)
    html = BeautifulSoup(text, features="html.parser")
    # photo_nr is 1-based in the tweet URL; convert to a 0-based index.
    index = int(self._regm.group("photo_nr") or 1) - 1
    data = html.select("div[tabindex='0']")[0].select(
        "div[data-image-url]")[index]
    self.src.append(data["data-image-url"])
def fetch(self, update_sources=True, probe=False, ratelimit_retry=False):
    """Fetch this post's sources, translating failures into a status code.

    :param update_sources: when true, run the subclass ``_fetch`` to
        populate ``self.src`` / ``self.meta``.
    :param probe: when true (and updating), issue a HEAD request first
        to check the URL is reachable.
    :param ratelimit_retry: forwarded to ``make_request`` for the probe.

    Network/SSL errors are recorded in ``self.status`` (HTTP code, 400
    for generic URL errors, 495 for certificate errors); any other
    exception is stored in ``self.exception`` with status 400.
    """
    try:
        if update_sources and probe:
            make_request(self.url, 'HEAD', ratelimit_retry)
        if update_sources:
            self._fetch()
    except urllib.error.HTTPError as e:
        self.status = e.code
    except urllib.error.URLError:
        self.status = 400
    except ssl.CertificateError:
        self.status = 495
    except Exception as e:
        self.exception = e
        self.status = 400
    if type(self) is Other:
        # BUG FIX: was `self.status == 200`, a no-op comparison.
        # "Other" posts have no fetcher and always count as successful.
        self.status = 200
def _fetch(self):
    """Scrape the full-size image link from an e-shuushuu page."""
    with make_request(self.url, "GET") as page:
        markup = decode_all(page)
    document = BeautifulSoup(markup, features="html.parser")
    # The first "thumb_image" anchor points at the full-size file.
    thumb = document.select("a.thumb_image")[0]
    self.src.append("http://e-shuushuu.net" + thumb["href"])
def _fetch(self):
    """Extract the direct image URL from zerochan's /full/ page."""
    # TODO Can't access nsfw pictures
    full_page = "https://www.zerochan.net/full/" + self._regm.group("id")
    with make_request(full_page, "GET") as page:
        markup = decode_all(page)
    document = BeautifulSoup(markup, features="html.parser")
    # The main image is the <img> whose alt text contains "Tags".
    image = document.select('img[alt*="Tags"]')[0]
    self.src.append(image["src"])
def _fetch(self):
    """Query safebooru's XML API for this post's file URL and metadata."""
    api_url = ("https://safebooru.org/index.php?page=dapi&s=post&q=index&limit=1&id="
               + self._regm.group("id"))
    with make_request(api_url, "GET") as page:
        payload = decode_all(page)
    # Response layout: <posts><post .../></posts> — grab the single post node.
    post = minidom.parseString(payload).childNodes[0].childNodes[0]
    self.src.append("http:" + post.getAttribute("file_url"))
    original = post.getAttribute("source")
    # Only keep the upstream source if it is actually a valid URL.
    if original and validators.url(original):
        self.src.append(original)
    rated = post.getAttribute("rating")
    if rated:
        self.meta["rating"] = rated
def _fetch(self):
    """Pull post metadata and file/source URLs from danbooru's JSON API."""
    endpoint = "https://danbooru.donmai.us/posts/" + self._regm["id"] + ".json"
    with make_request(endpoint, "GET") as page:
        post = json.loads(decode_all(page))
    if "tag_string_artist" in post:
        self.meta["author"] = post["tag_string_artist"]
    if "tag_string_character" in post:
        # Characters come as one space-separated tag string.
        characters = post["tag_string_character"].strip()
        if characters:
            self.meta["character"] = characters.split(" ")
    self.meta["rating"] = post["rating"]
    self.meta["uid"] = post["md5"]
    self.src.append(post["file_url"])
    # Keep the upstream source too, but only if it is a valid URL.
    if "source" in post and validators.url(post["source"]):
        self.src.append(post["source"])
def download_img(source):
    """Download the image referenced by *source* and write its XMP metadata.

    :param source: dict with keys ``img`` (image URL), ``meta`` (tag dict,
        must contain a ``uid``), and optional ``src`` / ``via`` URL lists.

    The file is streamed to ``<out_folder>/<name>.tmp`` and renamed to
    ``<UUID(uid)><ext>`` on success.  Size / content-type problems and
    all exceptions are logged; nothing is raised to the caller.
    """
    img = source.get("img", None)
    meta = source.get("meta", {})
    uid = meta.get("uid", None)
    if not img:
        log("No image supplied for\n" + str(source))
        return
    if not uid:
        log("No uid for\n" + str(source))
        # BUG FIX: previously fell through and crashed on UUID(None),
        # which was then swallowed by the generic handler below.
        return
    tmpfile = None
    try:
        url = urlparse(img)
        filename = os.path.basename(url.path)
        _, ext = os.path.splitext(filename)
        tmpfile = Path(ARGS.out_folder) / (filename + ".tmp")
        imgfile = Path(ARGS.out_folder) / (str(UUID(uid)) + ext)

        response = make_request(img, "GET")
        header = response.info()
        cnt_type = header["Content-Type"]
        if cnt_type not in ("image/jpeg", "image/png"):
            log("Unknown content type", cnt_type, "for", img)
            return
        size = int(header["Content-Length"])
        size_mb = size / 1_000_000
        if size_mb > ARGS.max_filesize:
            log("%s is too big! You specified a maximum size of %d MB, file is %.2f MB" % (
                img, ARGS.max_filesize, size_mb))
            return

        total_chunks = math.ceil(size / config.download_chunk_size)
        with response as stream:
            with open(tmpfile, "wb") as outf:
                log("Starting download of", img)
                for _ in atpbar.atpbar(range(total_chunks), name=img):
                    chunk = stream.read(config.download_chunk_size)
                    if not chunk:
                        break
                    outf.write(chunk)
        os.rename(tmpfile, imgfile)

        xmpfile = imgfile.with_suffix(".xmp")
        cute_meta = CuteMeta.from_file(xmpfile)
        cute_meta.clear()  # Delete all unwanted tags
        log("Generating image hash for file", imgfile)
        cute_meta.hash = hash_img(imgfile)
        log("Hashed", imgfile, "as", cute_meta.hash, "(phash, 16)")
        cute_meta.read_from_dict(meta, ignore_missing_keys=True)
        cute_meta.add_characters(*meta.get("character", []))
        cute_meta.source = img
        cute_meta.source_other = source.get("src", [])
        cute_meta.source_via = source.get("via", [])
        cute_meta.date = datetime.utcnow()
        cute_meta.write()
    except urllib.error.HTTPError as e:
        status = e.code
    except urllib.error.URLError:
        status = 400
    except ssl.CertificateError:
        status = 495
    except Exception as e:
        log("An exception occured while fetching url %s: %s" % (img, str(e)))
        status = 0
    else:
        status = 200
    finally:
        # BUG FIX: don't leave a stale .tmp file behind when the
        # download or metadata write fails part-way through.
        if tmpfile is not None and os.path.exists(tmpfile):
            try:
                os.remove(tmpfile)
            except OSError:
                pass
    if status and status != 200:
        log("%s: %s" % (status, img))