def Kuvat(self):
    kuvat = []
    images = self.soup.find_all("img")
    for image in images:
        x = image.get("alt")
        if x is None or "strip" not in x.lower():
            continue
        kuva = dict(nimi=None, src=None, filetype=None)
        try:
            if image["src"].index("//") == 0:
                image["src"] = "http:{}".format(image["src"])
        except:
            pass
        try:
            if image["src"].index("./") == 0:
                image["src"] = image["src"].replace("./", "/")
        except:
            pass
        kuva["nimi"] = "{}".format(image["src"].split("/")[-1])  # image name = file name
        if "://" in image["src"]:
            kuva["src"] = url_fix("{}".format(image["src"]))
        else:
            kuva["src"] = url_fix("{}/{}".format(self.sarjakuva.url, image["src"]))
        kuva["filetype"] = "{}".format(image["src"].split(".")[-1])
        kuvat.append(kuva)
    return kuvat

def Kuvat(self):
    kuvat = []
    div = self.soup.find("div", {"class": "comic_group"})
    images = div.find_all("img")
    for image in images:
        kuva = dict(nimi=None, src=None, filetype=None)
        try:
            if image["src"].index("//") == 0:
                image["src"] = "http:{}".format(image["src"])
        except:
            pass
        try:
            if image["src"].index("./") == 0:
                image["src"] = image["src"].replace("./", "/")
        except:
            pass
        kuva["nimi"] = "{}".format(image["src"].split("/")[-1])  # image name = file name
        if "://" in image["src"]:
            kuva["src"] = url_fix("{}".format(image["src"].strip()))
        else:
            uu = "/".join(self.urli.split("/")[:-1])
            kuva["src"] = url_fix("{}/{}".format(uu, image["src"].strip()))
        kuva["filetype"] = "{}".format(image["src"].split(".")[-1])
        kuvat.append(kuva)
    return kuvat
def test_url_fixing():
    """URL fixing"""
    x = url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffskl\xe4rung)')
    assert x == 'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'

    x = url_fix('http://example.com/?foo=%2f%2f')
    assert x == 'http://example.com/?foo=%2f%2f'
def Kuvat(self):
    kuvat = []
    #mages = self.soup.find_all("img", { "class": "strip" })
    #for image in images:
    image = self.soup.find(id="comicimg")
    if image:
        kuva = dict(nimi=None, src=None, filetype=None)
        try:
            if image["src"].index("//") == 0:
                image["src"] = "http:{}".format(image["src"])
        except:
            pass
        try:
            if image["src"].index("./") == 0:
                image["src"] = image["src"].replace("./", "/")
        except:
            pass
        kuva["nimi"] = "{}".format(image["src"].split("/")[-1])  # image name = file name
        kuva["src"] = url_fix("{}".format(image["src"]))
        if "://" not in kuva["src"]:
            kuva["src"] = url_fix("{}/{}".format(self.sarjakuva.url, image["src"]))
        kuva["filetype"] = "{}".format(image["src"].split(".")[-1])
        kuvat.append(kuva)
    return kuvat
def wiki_image(self, addr, alt, class_='wiki', lineno=0):
    """Create HTML for a wiki image."""
    addr = addr.strip()
    chunk = ''
    if hatta.parser.external_link(addr):
        return html.img(src=url_fix(addr), class_="external", alt=alt)
    if '#' in addr:
        addr, chunk = addr.split('#', 1)
    if addr == '':
        return html.a(name=chunk)
    elif addr.startswith(':'):
        if chunk:
            chunk = '#' + chunk
        alias = self.link_alias(addr[1:])
        href = url_fix(alias + chunk)
        return html.img(src=href, class_="external alias", alt=alt)
    elif addr in self.storage:
        mime = page_mime(addr)
        if mime.startswith('image/'):
            return html.img(src=self.get_download_url(addr), class_=class_, alt=alt)
        else:
            return html.img(href=self.get_download_url(addr), alt=alt)
    else:
        return html.a(html(alt), href=self.get_url(addr))
def Kuvat(self): kuvat = [] div = self.soup.find("noscript") image = div.find("img") kuva = dict(nimi=None, src=None, filetype="png") try: if image["src"].index("//") == 0: image["src"] = "http:{}".format(image["src"]) except: pass try: if image["src"].index("./") == 0: image["src"] = image["src"].replace("./", "/") except: pass image["src"] = image["src"].split("?")[0] image["src"] = "{}".format(image["src"].replace("_250.", "_1280.")) image["src"] = "{}".format(image["src"].replace("_500.", "_1280.")) kuva["nimi"] = "{}.{}".format(image["src"].split("/")[-1], kuva["filetype"]) # kuvan nimi = tiedoston nimi kuva["src"] = url_fix("{}".format(image["src"])) if not "://" in image["src"]: kuva["src"] = url_fix( "{}{}".format(self.sarjakuva.url, image["src"]) ) #kuva["filetype"] = u"{}".format(image["src"].split(".")[-1]) kuvat.append(kuva) return kuvat
def Kuvat(self): kuvat = [] #div = self.soup.find("div", { "class": "comic-content"}) #images = div.find_all("img") #for image in images: table = self.soup.find(id="comic") if table is None: table = self.soup.find("table", { "class": "shadow"} ) images = table.find_all("img") for image in images: kuva = dict(nimi=None, src=None, filetype=None) try: if image["src"].index("//") == 0: image["src"] = "http:{}".format(image["src"]) except: pass try: if image["src"].index("./") == 0: image["src"] = image["src"].replace("./", "/") except: pass #image["src"] = u"{}".format(image["src"].replace(u"_250.", u"_1280.")) kuva["nimi"] = "{}".format(image["src"].split("/")[-1]) # kuvan nimi = tiedoston nimi if "://" in image["src"]: kuva["src"] = url_fix("{}".format(image["src"])) else: kuva["src"] = url_fix("{}/{}".format(self.sarjakuva.url, image["src"])) kuva["filetype"] = "{}".format(image["src"].split(".")[-1]) kuvat.append(kuva) return kuvat
def Kuvat(self): kuvat = [] #div = self.soup.find("div", { "class": "comic-content"}) #images = div.find_all("img") #for image in images: content = self.soup.find("div", { "class": "content"}) section = content.find("section") entry = section.find("div", { "class": "entry"}) image = entry.find("img") kuva = dict(nimi=None, src=None, filetype=None) try: if image["src"].index("//") == 0: image["src"] = "http:{}".format(image["src"]) except: pass try: if image["src"].index("./") == 0: image["src"] = image["src"].replace("./", "/") except: pass kuva["nimi"] = "{}".format(image["src"].split("/")[-1]) # kuvan nimi = tiedoston nimi if "://" in image["src"]: kuva["src"] = url_fix("{}".format(image["src"])) else: kuva["src"] = url_fix("{}/{}".format(self.sarjakuva.url, image["src"])) kuva["filetype"] = "{}".format(image["src"].split(".")[-1]) kuvat.append(kuva) return kuvat
def Kuvat(self): kuvat = [] div = self.soup.find(id="cc") images = div.find_all("img") for image in images: kuva = dict(nimi=None, src=None, filetype=None) try: if image["src"].index("//") == 0: image["src"] = "http:{}".format(image["src"]) except: pass try: if image["src"].index("./") == 0: image["src"] = image["src"].replace("./", "/") except: pass kuva["nimi"] = "{}".format(image["src"].split("/")[-1]) # kuvan nimi = tiedoston nimi if "://" in image["src"]: kuva["src"] = url_fix("{}".format(image["src"])) else: kuva["src"] = url_fix("{}/{}".format(self.sarjakuva.url, image["src"])) kuva["filetype"] = "{}".format(image["src"].split(".")[-1]) kuvat.append(kuva) return kuvat
def test_url_fixing(self):
    x = urls.url_fix(
        u'http://de.wikipedia.org/wiki/Elf (Begriffskl\xe4rung)')
    assert x == 'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'

    x = urls.url_fix('http://example.com/?foo=%2f%2f')
    assert x == 'http://example.com/?foo=%2f%2f'

def test_url_fixing(self):
    x = urls.url_fix(
        u'http://de.wikipedia.org/wiki/Elf (Begriffskl\xe4rung)')
    self.assert_line_equal(
        x, 'http://de.wikipedia.org/wiki/Elf%20(Begriffskl%C3%A4rung)')

    x = urls.url_fix("http://just.a.test/$-_.+!*'(),")
    self.assert_equal(x, "http://just.a.test/$-_.+!*'(),")

def test_url_fixing_qs():
    x = urls.url_fix(b'http://example.com/?foo=%2f%2f')
    assert x == 'http://example.com/?foo=%2f%2f'

    x = urls.url_fix('http://acronyms.thefreedictionary.com/'
                     'Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation')
    assert x == ('http://acronyms.thefreedictionary.com/'
                 'Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation')

def test_url_fixing_qs():
    x = urls.url_fix(b'http://example.com/?foo=%2f%2f')
    assert x == 'http://example.com/?foo=%2f%2f'

    x = urls.url_fix(
        'http://acronyms.thefreedictionary.com/Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation'
    )
    assert x == 'http://acronyms.thefreedictionary.com/Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation'

def test_url_fixing(self):
    x = urls.url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffskl\xe4rung)')
    assert x == 'http://de.wikipedia.org/wiki/Elf%20(Begriffskl%C3%A4rung)'

    x = urls.url_fix("http://just.a.test/$-_.+!*'(),")
    assert x == "http://just.a.test/$-_.+!*'(),"

    x = urls.url_fix('http://example.com/?foo=%2f%2f')
    assert x == 'http://example.com/?foo=%2f%2f'

def test_url_fixing(self):
    x = urls.url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffskl\xe4rung)')
    assert x == 'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'

    x = urls.url_fix('http://example.com/?foo=%2f%2f')
    assert x == 'http://example.com/?foo=%2f%2f'

    x = urls.url_fix('http://acronyms.thefreedictionary.com/Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation')
    assert x == 'http://acronyms.thefreedictionary.com/Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation'

def test_url_fixing():
    x = urls.url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffskl\xe4rung)')
    assert x == 'http://de.wikipedia.org/wiki/Elf%20(Begriffskl%C3%A4rung)'

    x = urls.url_fix("http://just.a.test/$-_.+!*'(),")
    assert x == "http://just.a.test/$-_.+!*'(),"

    x = urls.url_fix('http://höhöhö.at/höhöhö/hähähä')
    assert x == r'http://xn--hhh-snabb.at/h%C3%B6h%C3%B6h%C3%B6/h%C3%A4h%C3%A4h%C3%A4'

def test_url_fixing_qs(self):
    x = urls.url_fix(b'http://example.com/?foo=%2f%2f')
    self.assert_line_equal(x, 'http://example.com/?foo=%2f%2f')

    x = urls.url_fix(
        'http://acronyms.thefreedictionary.com/Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation'
    )
    self.assert_equal(
        x,
        'http://acronyms.thefreedictionary.com/Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation'
    )

def test_url_fixing(self):
    x = urls.url_fix(
        u'http://de.wikipedia.org/wiki/Elf (Begriffskl\xe4rung)')
    assert x == 'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'

    x = urls.url_fix('http://example.com/?foo=%2f%2f')
    assert x == 'http://example.com/?foo=%2f%2f'

    x = urls.url_fix(
        'http://acronyms.thefreedictionary.com/Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation'
    )
    assert x == 'http://acronyms.thefreedictionary.com/Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation'

def test_url_fixing_qs():
    x = urls.url_fix(b"http://example.com/?foo=%2f%2f")
    assert x == "http://example.com/?foo=%2f%2f"

    x = urls.url_fix(
        "http://acronyms.thefreedictionary.com/"
        "Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation"
    )
    assert x == (
        "http://acronyms.thefreedictionary.com/"
        "Algebraic+Methods+of+Solving+the+Schr%C3%B6dinger+Equation"
    )
def Kuvat(self):
    kuvat = []
    #div = self.soup.find("div", { "class": "comic-content"})
    #images = div.find_all("img")
    #for image in images:
    # image = self.soup.find("img", { "class": "alignnone"})
    # if image is None:
    #     image = self.soup.find("img", { "class": "aligncenter"})
    images = self.soup.find_all("img")
    for image in images:
        try:
            width = image.get("width")
            if int(width) < 400:
                continue
        except Exception as e:
            #print e
            continue
        kuva = dict(nimi=None, src=None, filetype=None)
        try:
            image["src"] = image["src"].split("?")[0]
            if image["src"].index("//") == 0:
                image["src"] = "http:{}".format(image["src"])
        except:
            pass
        try:
            if image["src"].index("./") == 0:
                image["src"] = image["src"].replace("./", "/")
        except:
            pass
        #image["src"] = u"{}".format(image["src"].replace(u"_250.", u"_1280."))
        kuva["nimi"] = "{}".format(image["src"].split("/")[-1])  # image name = file name
        if "data:image" in image["src"]:
            kuva["src"] = image["src"]
        elif "://" in image["src"]:
            kuva["src"] = url_fix("{}".format(image["src"]))
        else:
            kuva["src"] = url_fix("{}{}".format(self.sarjakuva.url, image["src"]))
        kuva["filetype"] = "{}".format(image["src"].split(".")[-1])
        kuvat.append(kuva)
    return kuvat
def coerce_url(url: str) -> str:
    """
    Coerce URL to valid format

    :param url: URL
    :return: str
    """
    url = url.strip()
    if url.startswith("feed://"):
        return url_fix("http://{0}".format(url[7:]))
    for proto in ["http://", "https://"]:
        if url.startswith(proto):
            return url_fix(url)
    return url_fix("http://{0}".format(url))
def _write(self, path, iname=None, data=None, replace=False, url=None, type=type, **kwargs):
    if not (iname or data or url):
        raise Exception('Either iname, data or url need to be passed')
    data = dumps(data) if isinstance(data, dict) or isinstance(data, list) else data
    data = data.encode('utf-8') if isinstance(data, str) else data
    if url and type == 'Reference':
        kwargs.update({'replace': replace, 'url': url, 'type': type})
        r = self._put(url_fix(urljoin(self.url, path)), params=kwargs, data='')
    else:
        with TemporaryFile(mode='wb+') as f, \
                open(iname, mode='rb') if iname else BytesIO(data) if data else htopen(url, mode='rb') as i:
            hasher = sha256()
            b = None
            while b is None or b != b'':
                b = i.read(100 * 1024)
                f.write(b)
                hasher.update(b)
            f.seek(0)
            r = self._put(
                url_fix(urljoin(self.url, path)),
                params={
                    'replace': replace,
                    'expected_hash': 'SHA256:' + hasher.digest().hex(),
                    'url': url,
                    'type': type
                },
                files={path: f})
    if r.status_code not in [200, 204]:
        raise Exception('%d %s' % (r.status_code, r.text))
def Kuvat(self):
    kuvan_nimi = None
    src = None
    kuvat = []
    centers = self.soup.find_all("center")
    for i in centers:
        center = i.find("center")
        image = i.find("img")
        br = i.find("br")
        if center is None and image and br:
            kuva = dict(nimi=None, src=None)
            image["src"] = image["src"].replace("\n", "")
            if image["src"].index("//") == 0:
                image["src"] = "http:{}".format(image["src"])
            #image["src"] = u"{}".format(image["src"].replace(u"_250.", u"_1280."))
            kuva["nimi"] = "{}".format(image["src"].split("/")[-1])  # image name = file name
            kuva["src"] = url_fix("{}".format(image["src"]))
            kuva["filetype"] = "{}".format(image["src"].split(".")[-1])
            kuvat.append(kuva)
    return kuvat

def Kuvat(self):
    kuvat = []
    ul = self.soup.find("ul", {"class": "latest-blog-posts-list"})
    images = ul.find_all("img")
    for image in images:
        if "comic" not in image["src"]:
            continue
        kuva = dict(nimi=None, src=None)
        image["src"] = image["src"].split("?")[0]
        try:
            if image["src"].index("//") == 0:
                image["src"] = "http:{}".format(image["src"])
        except:
            pass
        try:
            if image["src"].index("./") == 0:
                image["src"] = image["src"].replace("./", "/")
        except:
            pass
        image["src"] = "{}".format(image["src"].replace("_250.", "_1280."))
        image["src"] = "{}".format(image["src"].replace("_500.", "_1280."))
        kuva["nimi"] = "{}".format(image["src"].split("/")[-1])  # image name = file name
        kuva["src"] = url_fix("{}".format(image["src"]))
        kuva["filetype"] = "{}".format(image["src"].split(".")[-1])
        found = self.sessio.query(Strippi).filter(
            Strippi.sarjakuva_id == self.sarjakuva.id,
            Strippi.url == image["src"]
        ).first()
        if not found:
            kuvat.append(kuva)
    return kuvat

def Kuvat(self):
    kuvat = []
    div = self.soup.find("div", {"class": "comic"})
    image = div.find("img")
    kuva = dict(nimi=None, src=None)
    try:
        if image["src"].index("//") == 0:
            image["src"] = "http:{}".format(image["src"])
    except:
        pass
    try:
        if image["src"].index("./") == 0:
            image["src"] = image["src"].replace("./", "/")
    except:
        pass
    image["src"] = "{}".format(image["src"].replace("_250.", "_1280."))
    image["src"] = "{}".format(image["src"].replace("_500.", "_1280."))
    kuva["nimi"] = "{}".format(image["src"].split("/")[-1])  # image name = file name
    kuva["src"] = url_fix("{}/{}".format(self.sarjakuva.url, image["src"]))
    kuva["filetype"] = "{}".format(image["src"].split(".")[-1])
    kuvat.append(kuva)
    return kuvat
def Kuvat(self):
    kuvat = []
    #mages = self.soup.find_all("img", { "class": "strip" })
    #for image in images:
    div = self.soup.find("noscript")
    image = div.find("img")
    kuva = dict(nimi=None, src=None, filetype=None)
    try:
        image["src"] = image["src"].split("?")[0]
        if image["src"][-1] == "/":
            image["src"] = image["src"][:-1]
            kuva["filetype"] = "jpg"
        if image["src"].index("//") == 0:
            image["src"] = "http:{}".format(image["src"])
    except:
        pass
    try:
        if image["src"].index("./") == 0:
            image["src"] = image["src"].replace("./", "/")
    except:
        pass
    kuva["nimi"] = "{}".format(image["src"].split("/")[-1])  # image name = file name
    kuva["src"] = url_fix("{}".format(image["src"]))
    if kuva["filetype"] is None:
        kuva["filetype"] = "{}".format(image["src"].split(".")[-1])
    kuvat.append(kuva)
    return kuvat
def Kuvat(self):
    kuvat = []
    #div = self.soup.find("div", { "class": "comic-content"})
    #images = div.find_all("img")
    #for image in images:
    image = self.soup.find("img", {"class": "img-comic"})
    kuva = dict(nimi=None, src=None, filetype="jpg")
    try:
        if image["src"].index("//") == 0:
            image["src"] = "http:{}".format(image["src"])
    except:
        pass
    try:
        if image["src"].index("./") == 0:
            image["src"] = image["src"].replace("./", "/")
    except:
        pass
    image["src"] = "{}".format(image["src"].replace("_250.", "_1280."))
    kuva["nimi"] = "{}.{}".format(image["src"].split("/")[-1], kuva["filetype"])  # image name = file name
    kuva["src"] = url_fix("{}".format(image["src"]))
    #kuva["filetype"] = u"{}".format(image["src"].split(".")[-1])
    kuvat.append(kuva)
    return kuvat
def fix_target(self):
    if isinstance(self.target.data, str):
        self.target.data = self.target.data.strip()
        pre, sep, _ = self.target.data.partition("//")
        if not sep:
            self.target.data = f"http://{pre}"
        self.target.data = url_fix(self.target.data)
def parse_product(self, response):
    soup = BeautifulSoup(response.body, 'lxml')
    p = Product()
    for element, path in self.selectors.viewitems():
        node = soup.select_one(path)
        if not node:
            continue
        if element == 'image':
            p[element] = url_fix(urljoin(response.url, node['src']))
        else:
            p[element] = text(node)
    if 'name' in p and 'number' in p:
        p['url'] = response.url
        p['pricing'], p['discountcode'] = get_prices(soup)
        soup.decompose()
        yield p
    else:
        # Only follow links on non-product pages
        soup.decompose()
        for link in self.link_extractor.extract_links(response):
            yield Request(url=link.url)
def Kuvat(self): kuvat = [] div = self.soup.find(id="wsite-content") figures = div.find_all("div", {"class": "wsite-image"}) for figure in figures: images = figure.find_all("img") for image in images: kuva = dict(nimi=None, src=None) try: if image["src"].index("//") == 0: image["src"] = "http:{}".format(image["src"]) except: pass try: if image["src"].index("./") == 0: image["src"] = image["src"].replace("./", "/") except: pass image["src"] = image["src"].split("?")[0] image["src"] = "{}".format(image["src"].replace("_250.", "_1280.")) image["src"] = "{}".format(image["src"].replace("_500.", "_1280.")) kuva["nimi"] = "{}".format(image["src"].split("/")[-1]) # kuvan nimi = tiedoston nimi kuva["src"] = url_fix( "{}{}".format(self.sarjakuva.url, image["src"]) ) kuva["filetype"] = "{}".format(image["src"].split(".")[-1]) kuvat.append(kuva) return kuvat
def __init__(self, content):
    self.content = content.encode('ascii', 'ignore')
    # make safe, but retain spaces
    self._words = wz.url_fix(self.content.lower()).replace('%20', ' ').split(' ')
    self._body_normalized = ' ' + ' '.join(self._words) + ' '
def Kuvat(self): kuvat = [] div = self.soup.find("div", { "class": "comic-content"}) images = div.find_all("img") for image in images: kuva = dict(nimi=None, src=None) try: if image["src"].index("//") == 0: image["src"] = "http:{}".format(image["src"]) except: pass try: if image["src"].index("./") == 0: image["src"] = image["src"].replace("./", "/") except: pass image["src"] = "{}".format(image["src"].replace("_250.", "_1280.")) kuva["nimi"] = "{}".format(image["src"].split("/")[-1]) # kuvan nimi = tiedoston nimi kuva["src"] = url_fix( "{}{}".format("http://www.interrobangstudios.com/", image["src"]) ) kuva["filetype"] = "{}".format(image["src"].split(".")[-1]) kuvat.append(kuva) return kuvat
def _validate_url(url: str) -> str:
    '''Validate a URL.

    Given a string, return a sanitized URL, or raise InvalidURLError if the
    string is not a valid URL.

    Args:
        url (str): The string to validate as a URL

    Returns:
        str: The sanitized, validated URL

    Raises:
        InvalidURLError: The argument is not a valid URL
    '''
    if not url or not isinstance(url, str):
        raise InvalidURLError

    # KISS. Can be expanded later if desired.
    valid_schemes = ['http', 'https']
    valid_netloc_pattern = re.compile(r'\w+\.\w+')

    url_tuple = url_parse(url, scheme='http')
    scheme, netloc, path = url_tuple.scheme, url_tuple.netloc, url_tuple.path
    if scheme not in valid_schemes:
        raise InvalidURLError
    if not re.match(valid_netloc_pattern, netloc) and \
            (netloc or not re.match(valid_netloc_pattern, path)):
        raise InvalidURLError
    return url_fix(url)
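# A quick sketch of how _validate_url above behaves, assuming InvalidURLError
# and Werkzeug's url_parse/url_fix are available in the same module: URLs with
# an http(s) scheme and a dotted host pass through, bare domains are accepted
# because the default scheme is 'http', and other inputs raise.
def demo_validate_url():
    assert _validate_url("http://example.com/page") == "http://example.com/page"
    assert _validate_url("example.com/page") == "example.com/page"
    for bad in ("ftp://example.com", "not a url", ""):
        try:
            _validate_url(bad)
        except InvalidURLError:
            pass
        else:
            raise AssertionError("expected InvalidURLError for %r" % bad)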
def Kuvat(self):
    kuvat = []
    articles = self.soup.find_all("article")
    for article in articles:
        figures = article.find_all("figure")
        for figure in figures:
            images = figure.find_all("img")
            for image in images:
                kuva = dict(nimi=None, src=None)
                try:
                    if image["src"].index("//") == 0:
                        image["src"] = "http:{}".format(image["src"])
                except:
                    pass
                try:
                    if image["src"].index("./") == 0:
                        image["src"] = image["src"].replace("./", "/")
                except:
                    pass
                #image["src"] = u"{}".format(image["src"].replace(u"_250.", u"_1280."))
                kuva["nimi"] = "{}".format(image["src"].split("/")[-1])  # image name = file name
                kuva["src"] = url_fix("{}".format(image["src"]))
                kuva["filetype"] = "{}".format(image["src"].split(".")[-1])
                kuvat.append(kuva)
    return kuvat

def Kuvat(self):
    kuvat = []
    # table = self.soup.find("table")
    # table = table.find("tbody")
    # table = self.soup.find("tr").find("td")
    found = self.soup.find_all("img")
    for image in found:
        try:
            if image["src"].index("images/") == 0:
                kuva = dict(nimi=None, src=None)
                # if image["src"].index("//") == 0:
                #     image["src"] = u"http:{}".format(image["src"])
                #image["src"] = u"{}".format(image["src"].replace(u"_250.", u"_1280."))
                kuva["nimi"] = "{}".format(image["src"].split("/")[-1])  # image name = file name
                kuva["src"] = url_fix("{}{}".format(self.sarjakuva.url, image["src"]))
                kuva["filetype"] = "{}".format(image["src"].split(".")[-1])
                kuvat.append(kuva)
                break
        except Exception as e:
            pass
    return kuvat

def Kuvat(self):
    kuvat = []
    div = self.soup.find(id="content")
    article = div.find("article")
    divs = article.find_all("div", {"class": "entry-content"})
    for div in divs:
        images = div.find_all("img")
        for image in images:
            kuva = dict(nimi=None, src=None)
            try:
                if image["src"].index("//") == 0:
                    image["src"] = "http:{}".format(image["src"])
            except:
                pass
            try:
                if image["src"].index("./") == 0:
                    image["src"] = image["src"].replace("./", "/")
            except:
                pass
            image["src"] = "{}".format(image["src"].replace("_250.", "_1280."))
            image["src"] = "{}".format(image["src"].replace("_500.", "_1280."))
            kuva["nimi"] = "{}".format(image["src"].split("/")[-1])  # image name = file name
            kuva["src"] = url_fix("{}".format(image["src"]))
            kuva["filetype"] = "{}".format(image["src"].split(".")[-1])
            kuvat.append(kuva)
    return kuvat

def Kuvat(self):
    kuvat = []
    # articles = self.soup.find_all("article")
    # for article in articles:
    #     figures = article.find_all("figure")
    #     for figure in figures:
    #         images = figure.find_all("img")
    #         #for image in images:
    kuva = dict(nimi=None, src=None)
    div = self.soup.find(id="comic-page")
    if div is None:
        div = self.soup.find(id="comic")
    if div is None:
        div = self.soup.find("div", {"class": "comic"})
    images = div.find_all("img")
    for image in images:
        if "?" in image["src"]:
            image["src"] = image["src"].split("?")[0]
        if image["src"].index("//") == 0:
            image["src"] = "http:{}".format(image["src"])
        #image["src"] = u"{}".format(image["src"].replace(u"_250.", u"_1280."))
        kuva["nimi"] = "{}".format(image["src"].split("/")[-1])  # image name = file name
        kuva["src"] = url_fix("{}".format(image["src"]))
        kuva["filetype"] = "{}".format(image["src"].split(".")[-1])
        kuvat.append(kuva)
    return kuvat
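# The Kuvat scrapers above all repeat the same src clean-up: drop the query
# string, upgrade protocol-relative ("//...") sources to http, rewrite a "./"
# prefix, and join relative paths onto a base URL before passing the result to
# url_fix. A hypothetical helper (not part of the original scraper code) could
# factor that out; `base_url` here stands in for self.sarjakuva.url.
from werkzeug.urls import url_fix

def normalize_img_src(src, base_url):
    src = src.split("?")[0].strip()
    if src.startswith("//"):
        src = "http:{}".format(src)
    elif src.startswith("./"):
        src = src.replace("./", "/", 1)
    if "://" in src or src.startswith("data:image"):
        return url_fix(src)
    return url_fix("{}/{}".format(base_url.rstrip("/"), src.lstrip("/")))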
def __init__(self, content):
    self.content = content
    # make safe, but retain spaces
    self._words = wz.url_fix(content).replace('%20', ' ').split(' ')
    self._body_normalized = ' ' + ' '.join(self._words) + ' '

def test_quoting(self):
    self.assert_strict_equal(urls.url_quote(u'\xf6\xe4\xfc'), '%C3%B6%C3%A4%C3%BC')
    self.assert_strict_equal(urls.url_unquote(urls.url_quote(u'#%="\xf6')), u'#%="\xf6')
    self.assert_strict_equal(urls.url_quote_plus('foo bar'), 'foo+bar')
    self.assert_strict_equal(urls.url_unquote_plus('foo+bar'), u'foo bar')
    self.assert_strict_equal(urls.url_quote_plus('foo+bar'), 'foo%2Bbar')
    self.assert_strict_equal(urls.url_unquote_plus('foo%2Bbar'), u'foo+bar')
    self.assert_strict_equal(
        urls.url_encode({b'a': None, b'b': b'foo bar'}), 'b=foo+bar')
    self.assert_strict_equal(
        urls.url_encode({u'a': None, u'b': u'foo bar'}), 'b=foo+bar')
    self.assert_strict_equal(
        urls.url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffsklärung)'),
        'http://de.wikipedia.org/wiki/Elf%20(Begriffskl%C3%A4rung)')
    self.assert_strict_equal(urls.url_quote_plus(42), '42')
    self.assert_strict_equal(urls.url_quote(b'\xff'), '%FF')

def to_python(self, value):
    "Normalize data to a list of strings."
    # Return an empty list if no input was given.
    if not value:
        return []
    return [url_fix(x.strip()) for x in value.split("\n") if x.strip()]
def Loop(self, url=None, sessio=db.session):
    self.sessio = sessio
    self.Init(url)
    kuvat = ["jpg", "jpeg", "gif", "png", "svg"]
    links = self.soup.find_all("a")
    count = 0
    loaded = sessio.query(Strippi.url).filter(
        Strippi.sarjakuva_id == self.sarjakuva.id
    ).all()
    loaded = [i.url for i in loaded]
    for link in links:
        nimi = link["href"]
        src = url_fix("{}{}".format(self.sarjakuva.last_url, nimi))
        filetype = "{}".format(nimi.split(".")[-1])
        if src in loaded:
            continue
        count += 1
        if filetype not in kuvat:
            # not the right kind of image
            continue
        self.Save(nimi, src, filetype)
    return None
def __init__(self, path='/', base_url=None, query_string=None, method='GET',
             input_stream=None, content_type=None, content_length=None,
             errors_stream=None, multithread=False, multiprocess=False,
             run_once=False, headers=None, data=None, environ_base=None,
             environ_overrides=None, charset='utf-8'):
    path_s = make_literal_wrapper(path)
    if query_string is None and path_s('?') in path:
        path, query_string = path.split(path_s('?'), 1)
    self.charset = charset
    self.path = iri_to_uri(path)
    if base_url is not None:
        base_url = url_fix(iri_to_uri(base_url, charset), charset)
    self.base_url = base_url
    if isinstance(query_string, (bytes, str)):
        self.query_string = query_string
    else:
        if query_string is None:
            query_string = MultiDict()
        elif not isinstance(query_string, MultiDict):
            query_string = MultiDict(query_string)
        self.args = query_string
    self.method = method
    if headers is None:
        headers = Headers()
    elif not isinstance(headers, Headers):
        headers = Headers(headers)
    self.headers = headers
    if content_type is not None:
        self.content_type = content_type
    if errors_stream is None:
        errors_stream = sys.stderr
    self.errors_stream = errors_stream
    self.multithread = multithread
    self.multiprocess = multiprocess
    self.run_once = run_once
    self.environ_base = environ_base
    self.environ_overrides = environ_overrides
    self.input_stream = input_stream
    self.content_length = content_length
    self.closed = False
    if data:
        if input_stream is not None:
            raise TypeError("can't provide input stream and data")
        if isinstance(data, str):
            data = data.encode(self.charset)
        if isinstance(data, bytes):
            self.input_stream = BytesIO(data)
            if self.content_length is None:
                self.content_length = len(data)
        else:
            for key, value in _iter_data(data):
                if isinstance(value, (tuple, dict)) or hasattr(value, 'read'):
                    self._add_file_from_data(key, value)
                else:
                    self.form.setlistdefault(key).append(value)
def get_query(self, url, sleep=0.0, force=False):
    # Get LastFM key and cache duration
    ConfigParam = self.env['ir.config_parameter'].sudo()
    fm_key = ConfigParam.get_param('oomusic.lastfm_key')
    fm_cache = int(ConfigParam.get_param('oomusic.lastfm_cache', 112))
    fm_info = ConfigParam.get_param('oomusic.fm_info', 'auto')
    if not fm_key:
        return '{}'

    url = url_fix(url + '&api_key=' + fm_key + '&format=json').encode('utf-8')
    url_hash = hashlib.sha1(url).hexdigest()

    new_cr = self.pool.cursor()
    Lastfm = self.with_env(self.env(cr=new_cr)).search([('name', '=', url_hash)])
    if force or not Lastfm or Lastfm.expiry_date < fields.Datetime.now():
        content = '{}'
        if fm_info == 'manual' and not force:
            Lastfm.env.cr.rollback()
            Lastfm.env.cr.close()
            content = Lastfm.content or content
            return content
        try:
            time.sleep(sleep)
            r = requests.get(url, timeout=3.0)
            if r.status_code == 200:
                content = r.content.decode('utf-8')
        except:
            _logger.info('Error while fetching URL "%s"', url, exc_info=True)

        expiry_date = datetime.datetime.utcnow() + datetime.timedelta(days=fm_cache)
        removal_date = datetime.datetime.utcnow() + datetime.timedelta(days=fm_cache + 14)

        # Save in cache
        with self.pool.cursor() as cr:
            new_self = Lastfm.with_env(self.env(cr=cr))
            if not Lastfm:
                writer = new_self.create
            else:
                writer = new_self.write
            writer({
                'name': url_hash,
                'url': url,
                'content': content,
                'expiry_date': expiry_date.strftime(DATETIME_FORMAT),
                'removal_date': removal_date.strftime(DATETIME_FORMAT),
            })
    else:
        content = Lastfm.content or '{}'
    Lastfm.env.cr.rollback()
    Lastfm.env.cr.close()
    return content
def step1(client_id, redirect_url, scope):
    # 1. Send a user to authorize your app
    auth_url = ('''https://login.xero.com/identity/connect/authorize?''' +
                '''response_type=code''' +
                '''&client_id=''' + client_id +
                '''&redirect_uri=''' + redirect_url +
                '''&scope=''' + scope +
                '''&state=123''')
    webbrowser.open_new(url_fix(auth_url))

def test_quoting(self):
    assert urls.url_quote(u'\xf6\xe4\xfc') == '%C3%B6%C3%A4%C3%BC'
    assert urls.url_unquote(urls.url_quote(u'#%="\xf6')) == u'#%="\xf6'
    assert urls.url_quote_plus('foo bar') == 'foo+bar'
    assert urls.url_unquote_plus('foo+bar') == 'foo bar'
    assert urls.url_encode({'a': None, 'b': 'foo bar'}) == 'b=foo+bar'
    assert urls.url_fix(u'http://de.wikipedia.org/wiki/Elf (Begriffsklärung)') == \
        'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'
def get_query(self, url, sleep=0.0, force=False): ConfigParam = self.env["ir.config_parameter"].sudo() sp_cache = int(ConfigParam.get_param("oomusic.spotify_cache", 182)) ext_info = ConfigParam.get_param("oomusic.ext_info", "auto") url = url_fix(url).encode("utf-8") url_hash = hashlib.sha1(url).hexdigest() Spotify = self.search([("name", "=", url_hash)]) if force or not Spotify or Spotify.expiry_date < fields.Datetime.now(): content = "{}" if ext_info == "manual" and not force: content = Spotify.content or content return content try: time.sleep(sleep) headers = { "Authorization": "Bearer {}".format( self.env["oomusic.spotify.token"]._get_token()) } r = requests.get(url, headers=headers, timeout=3.0) if r.status_code == 200: content = r.content.decode("utf-8") else: _logger.info( "Error while fetching URL '%s'. Error code: %s", url, r.status_code) return content except: _logger.info("Error while fetching URL '%s'", url, exc_info=True) return content expiry_date = datetime.datetime.utcnow() + datetime.timedelta( days=sp_cache) removal_date = datetime.datetime.utcnow() + datetime.timedelta( days=sp_cache + 14) # Save in cache if not Spotify: writer = self.create else: writer = Spotify.write writer({ "name": url_hash, "url": url, "content": content, "expiry_date": expiry_date, "removal_date": removal_date, }) self.env.cr.commit() else: content = Spotify.content or "{}" return content
def gallery(self, **kwargs):
    link, doc, doc_abspath = self._get_link(**kwargs)
    if not os.path.isdir(doc_abspath):
        abort(404)

    values = {
        "doc": doc,
        "browse": "/ooshare/browse?token={}&doc={}".format(kwargs["token"], doc),
        "imgs": sorted(
            [{
                "name": f.name,
                "url": url_fix("/ooshare/img?token={}&doc={}".format(
                    kwargs["token"], os.path.join(doc, f.name))),
                "url_thumb": url_fix("/ooshare/img?token={}&doc={}&thumb=1".format(
                    kwargs["token"], os.path.join(doc, f.name))),
            }
             for f in os.scandir(doc_abspath)
             if f.is_file()
             and ft.guess(os.path.join(doc_abspath, f.name))
             and ft.guess(os.path.join(doc_abspath, f.name)).extension in IMG_EXT],
            key=lambda d: d["name"],
        ),
        "vids": sorted(
            [{
                "name": f.name,
                "url": url_fix("/ooshare/vid?token={}&doc={}".format(
                    kwargs["token"], os.path.join(doc, f.name))),
                "mime": VID_EXT[os.path.splitext(f.name)[1][1:]],
            }
             for f in os.scandir(doc_abspath)
             if f.is_file() and os.path.splitext(f.name)[1][1:] in VID_EXT],
            key=lambda d: d["name"],
        ),
    }
    res = request.render("ooshare.gallery", values)
    return res
def proxy(url):
    # url must not be unicode; convert it to utf-8
    #url = url.encode('utf-8')
    url = url_fix(url)
    try:
        content = urllib2.urlopen(url, timeout=120).read()
        return content
    except:
        return u'error'
def coerce_url(url: str, https: bool = True) -> str:
    """
    Coerce URL to valid format

    :param url: URL
    :param https: Force https if no scheme in url
    :return: str
    """
    url = url.strip()
    if url.startswith("feed://"):
        return url_fix("http://{0}".format(url[7:]))
    for proto in ["http://", "https://"]:
        if url.startswith(proto):
            return url_fix(url)
    if https:
        return url_fix("https://{0}".format(url))
    else:
        return url_fix("http://{0}".format(url))
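# Expected behaviour of coerce_url above, as a quick sketch (assuming url_fix
# leaves these already-clean URLs unchanged): a bare domain gets a scheme
# (https by default), the feed:// pseudo-scheme is rewritten to http://, and
# URLs that already carry an http(s) scheme pass straight through.
def demo_coerce_url():
    assert coerce_url("example.com/rss") == "https://example.com/rss"
    assert coerce_url("example.com/rss", https=False) == "http://example.com/rss"
    assert coerce_url("feed://example.com/rss") == "http://example.com/rss"
    assert coerce_url("https://example.com/rss") == "https://example.com/rss"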
def run(self, url, year, created):
    """Run Celery Task."""
    self.job_id = self.request.id
    self.url = url_fix(url)
    time_started = datetime.now()
    self._log.info('Start calculating simhashes.')
    self.download_errors = 0
    if not self.url:
        self._log.error('did not give url parameter')
        return {'status': 'error', 'info': 'URL is required.'}
    if not year:
        self._log.error('did not give year parameter')
        return {'status': 'error', 'info': 'Year is required.'}
    # fetch captures
    self.update_state(
        state='PENDING',
        meta={'info': 'Fetching %s captures for year %s' % (url, year)})
    resp = self.fetch_cdx(url, year)
    if resp.get('status') == 'error':
        return resp
    captures = resp.get('captures')
    total = len(captures)
    self.seen = dict()
    # calculate simhashes in parallel
    i = 0
    final_results = {}
    for res in self.tpool.map(self.get_calc, captures):
        if not res:
            continue
        (timestamp, simhash) = res
        if simhash:
            final_results[timestamp] = simhash
        if i % 10 == 0:
            self.update_state(state='PENDING', meta={
                'info': 'Processed %d out of %d captures.' % (i, total)
            })
        i += 1
    self._log.info('%d final results for %s and year %s.',
                   len(final_results), self.url, year)
    if final_results:
        try:
            urlkey = surt(self.url)
            self.redis.hmset(urlkey, final_results)
            self.redis.expire(urlkey, self.simhash_expire)
        except RedisError as exc:
            self._log.error('cannot write simhashes to Redis for URL %s',
                            self.url, exc_info=1)
    duration = (datetime.now() - time_started).seconds
    self._log.info('Simhash calculation finished in %.2fsec.', duration)
    return {'duration': str(duration)}

def post_url():
    data = request.get_json()
    url = url_fix(data['url'])
    slug = r.get(url_prefix + url)
    if not slug:
        slug = base62.encode(r.incr('next_url_id'))
        r.hmset(slug_prefix + slug, {'url': url, 'visited': 0})
        r.set(url_prefix + url, slug)
    return jsonify({'url': url, 'slug': slug})
def wiki_link(self, addr, label=None, class_=None, image=None, lineno=0):
    """Create HTML for a wiki link."""
    addr = addr.strip()
    text = escape(label or addr)
    chunk = ''
    if class_ is not None:
        classes = [class_]
    else:
        classes = []
    if hatta.parser.external_link(addr):
        classes.append('external')
        if addr.startswith('mailto:'):
            # Obfuscate e-mails a little bit.
            classes.append('mail')
            text = text.replace('@', '&#64;').replace('.', '&#46;')
            href = escape(addr).replace('@', '%40').replace('.', '%2E')
        else:
            href = escape(url_fix(addr))
    else:
        if '#' in addr:
            addr, chunk = addr.split('#', 1)
            chunk = '#' + url_fix(chunk)
        if addr.startswith(':'):
            alias = self.link_alias(addr[1:])
            href = escape(url_fix(alias) + chunk)
            classes.append('external')
            classes.append('alias')
        elif addr.startswith('+'):
            href = '/'.join([self.request.script_root,
                             '+' + escape(addr[1:])])
            classes.append('special')
        elif addr == '':
            href = escape(chunk)
            classes.append('anchor')
        else:
            classes.append('wiki')
            href = escape(self.get_url(addr) + chunk)
            if addr not in self.storage:
                classes.append('nonexistent')
    class_ = escape(' '.join(classes) or '')
    # We need to output HTML on our own to prevent escaping of href
    return '<a href="%s" class="%s" title="%s">%s</a>' % (
        href, class_, escape(addr + chunk), image or text)
def request(url):
    """
    default call to the api
    call http://endpoint/v1/{url}
    return the response and the url called
    (it might have been modified with the normalization)
    """
    norm_url = url_fix(_api_current_root_point + url)  # normalize url
    raw_response = requests.get(norm_url)
    return json.loads(raw_response.text), norm_url, raw_response.status_code

def download_capture(self, ts):
    """Download capture from WBM and update job status.

    Return capture body (probably HTML text)
    """
    try:
        self._log.info('fetching capture %s %s', ts, self.url)
        resp = self.http.request('GET', '/web/%sid_/%s' % (ts, url_fix(self.url)))
        return resp.data.decode('utf-8', 'ignore')
    except HTTPError as exc:
        self._log.error('cannot fetch capture %s %s (%s)', ts, self.url, exc)

def save(cls, destination_mapping: str) -> SlugMapping:
    """Save a destination mapping as a UrlMapping and return the SlugMapping."""
    record = UrlMapping(
        slug=cls.generate_slug(destination_mapping=destination_mapping),
        destination_url=url_fix(destination_mapping),
    )
    record.save()
    return SlugMapping(
        slug=record.slug,
        destination_url=record.destination_url,
        created_at=record.created_at,
    )

def get_query(self, url, sleep=0.0, force=False):
    # Get LastFM key and cache duration
    ConfigParam = self.env["ir.config_parameter"].sudo()
    fm_key = ConfigParam.get_param("oomusic.lastfm_key")
    fm_cache = int(ConfigParam.get_param("oomusic.lastfm_cache", 112))
    ext_info = ConfigParam.get_param("oomusic.ext_info", "auto")
    if not fm_key:
        return "{}"

    url = url_fix(url + "&api_key=" + fm_key + "&format=json").encode("utf-8")
    url_hash = hashlib.sha1(url).hexdigest()

    Lastfm = self.search([("name", "=", url_hash)])
    if force or not Lastfm or Lastfm.expiry_date < fields.Datetime.now():
        content = "{}"
        if ext_info == "manual" and not force:
            content = Lastfm.content or content
            return content
        try:
            time.sleep(sleep)
            r = requests.get(url, timeout=3.0)
            if r.status_code == 200:
                content = r.content.decode("utf-8")
        except:
            _logger.info('Error while fetching URL "%s"', url, exc_info=True)

        expiry_date = datetime.datetime.utcnow() + datetime.timedelta(days=fm_cache)
        removal_date = datetime.datetime.utcnow() + datetime.timedelta(days=fm_cache + 14)

        # Save in cache
        if not Lastfm:
            writer = self.create
        else:
            writer = Lastfm.write
        writer({
            "name": url_hash,
            "url": url,
            "content": content,
            "expiry_date": expiry_date,
            "removal_date": removal_date,
        })
        self.env.cr.commit()
    else:
        content = Lastfm.content or "{}"
    return content

def fix_license_and_urls(data_frame):
    """
    Creates license URL and display name and fixes broken URLs from
    see_also_links and reference_list.

    :param data_frame: DataFrame to perform fixing operation on
    :return: Fixed DataFrame
    """
    # JOB: Transform license object to string
    license_base = 'https://choosealicense.com/licenses/'
    data_frame['license_url'] = data_frame['license'].apply(
        lambda lic: license_base + lic.get('key') if lic else None)
    data_frame['license'] = data_frame['license'].apply(
        lambda lic: lic.get('name') if lic else None)

    # JOB: Fix potentially broken URLs
    data_frame['see_also_links'] = data_frame['see_also_links'].apply(
        lambda ref_list: [url_fix(ref_link) for ref_link in ref_list])
    data_frame['reference_list'] = data_frame['reference_list'].apply(
        lambda ref_list: [url_fix(ref_link) for ref_link in ref_list])

    return data_frame

def getMarketItems(url, count, currency, start=0):
    if not url.startswith('http://') and not url.startswith('https://'):
        url = 'http://' + url
    url = url_fix(url)
    curr = CURRENCY[currency][0]
    urlextender = '/render/?query=&start=%s&count=%s&currency=%s' % (start, count, curr)
    try:
        request = requests.get(url + urlextender)
    except requests.ConnectionError:
        return 'Could not connect. Check URL and make sure you can connect to the internet.', None
    except exceptions.InvalidURL:
        return 'URL is invalid, please check your market URL.', None
    if request.status_code == 404:
        return 'Could not connect to Steam. Retry in a few minutes and check URL.', None
    if len(request.text) < 1000:
        return 'Response from Steam contains no skin data, URL is probably invalid.', None
    if request.url != url + urlextender:
        return 'Page redirected to %s, so no skins were found. Check your market URL.' % request.url, None
    data = request.text.split('"listinginfo":')[1].split(',"assets":')[0]
    try:
        data = json.loads(data, object_pairs_hook=OrderedDict)
    except ValueError:
        return 'Response from Steam contains no skin data, URL is probably invalid.', None
    # assetID => [marketID, inspect link, formatted price]
    datadic = OrderedDict()
    soldcount = 0
    for marketID in data:
        try:
            price = int(data[marketID]['converted_price']) + int(data[marketID]['converted_fee'])
            padded = "%03d" % (price, )
            price = padded[0:-2] + '.' + padded[-2:]
        except KeyError:
            price = 'SOLD'
            soldcount += 1
            continue  # Delete this line to keep SOLD ITEMS in the result
        link = data[marketID]['asset']['market_actions'][0]['link']
        assetID = data[marketID]['asset']['id']
        datadic[assetID] = [
            marketID,
            link.replace('%assetid%', assetID).replace('%listingid%', marketID),
            price
        ]
    return datadic, soldcount

def handleWord(word):
    g.word = WordDict(word)
    url = 'http://dictionaryapi.com/api/v1/references/collegiate/xml/%s?key=%s' % (
        word, API_KEY)
    xml = urllib2.urlopen(url_fix(url))
    try:
        dom = parse(xml)
    except:
        return g.word
    if dom.getElementsByTagName('entry'):
        handleEntries(dom.getElementsByTagName('entry'))
    else:
        handleSuggestion(dom.getElementsByTagName('suggestion'))
    return g.word