def _get_scrape_url(link):
    """Pick the URL to send to the media scraper for a link.

    For non-self posts this is the link's own URL (with imgur .gif
    rewritten to .gifv when the feature flag is on).  For self posts,
    scan the selftext markdown for a likely image URL.
    """
    if not link.is_self:
        sr_name = link.subreddit_slow.name
        if not feature.is_enabled("imgur_gif_conversion", subreddit=sr_name):
            return link.url
        p = UrlParser(link.url)
        # If it's a gif link on imgur, replacing it with gifv should
        # give us the embedly friendly video url
        if is_subdomain(p.hostname, "imgur.com"):
            if p.path_extension().lower() == "gif":
                p.set_extension("gifv")
                return p.unparse()
        return link.url

    # Self post: look for a scrapeable URL in the body text.
    urls = extract_urls_from_markdown(link.selftext)
    second_choice = None
    for url in urls:
        p = UrlParser(url)
        if p.is_reddit_url():
            continue
        # If we don't find anything we like better, use the first image.
        if not second_choice:
            second_choice = url
        # This is an optimization for "proof images" in AMAs.
        if is_subdomain(p.netloc, 'imgur.com') or p.has_image_extension():
            return url
    # May be None if every URL was a reddit URL or there were none.
    return second_choice
def test_default_prefix(self):
    """A missing or unknown extension falls back to the www subdomain."""
    parsed = UrlParser('http://i.reddit.com/r/redditdev')
    parsed.switch_subdomain_by_extension()
    self.assertEquals('http://www.reddit.com/r/redditdev', parsed.unparse())

    parsed = UrlParser('http://i.reddit.com/r/redditdev')
    parsed.switch_subdomain_by_extension('does-not-exist')
    self.assertEquals('http://www.reddit.com/r/redditdev', parsed.unparse())
def test_url_mutation(self):
    """Reassigning hostname flips is_reddit_url() in both directions."""
    parsed = UrlParser("http://example.com/")
    parsed.hostname = g.domain
    self.assertTrue(parsed.is_reddit_url())

    parsed = UrlParser("http://%s/" % g.domain)
    parsed.hostname = "example.com"
    self.assertFalse(parsed.is_reddit_url())
def make_anchored_permalink(self, link=None, sr=None, context=1, anchor=None):
    """Build a permalink carrying a ?context= param and a #fragment.

    The fragment defaults to this thing's id36 when no anchor is given.
    """
    if link:
        base = self.make_permalink(link, sr)
    else:
        base = self.make_permalink_slow()

    permalink = UrlParser(base)
    permalink.update_query(context=context)
    if anchor:
        permalink.fragment = anchor
    else:
        permalink.fragment = self._id36
    return permalink.unparse()
def test_normal_urls(self):
    """Known extensions map to their corresponding subdomains."""
    parsed = UrlParser('http://www.reddit.com/r/redditdev')
    parsed.switch_subdomain_by_extension('compact')
    compact_url = parsed.unparse()
    self.assertEquals('http://i.reddit.com/r/redditdev', compact_url)

    parsed = UrlParser(compact_url)
    parsed.switch_subdomain_by_extension('mobile')
    mobile_url = parsed.unparse()
    self.assertEquals('http://simple.reddit.com/r/redditdev', mobile_url)
def test_same_url(self):
    """Equality ignores query-parameter order and works for built-up URLs."""
    first = UrlParser('http://example.com:8000/a;b?foo=bar&bar=baz#spam')
    second = UrlParser('http://example.com:8000/a;b?bar=baz&foo=bar#spam')
    self.assertEquals(first, second)

    built = UrlParser('')
    built.scheme = 'http'
    built.hostname = 'example.com'
    built.port = 8000
    built.path = '/a'
    built.params = 'b'
    built.update_query(foo='bar', bar='baz')
    built.fragment = 'spam'
    self.assertEquals(first, built)
def validate_secure_oembed(self, oembed):
    """Check the "secure" embed is safe to embed, and not a placeholder"""
    if not oembed.get("html"):
        return False

    # Get the embed.ly iframe's src
    iframe_src = lxml.html.fromstring(oembed['html']).get('src')
    if not iframe_src:
        return False
    iframe_src_url = UrlParser(iframe_src)

    # Per embed.ly support: If the URL for the provider is HTTP, we're
    # gonna get a placeholder image instead
    # NOTE(review): query_dict.get('src') may be None if the iframe src
    # has no src param — assumes UrlParser tolerates None input; confirm.
    provider_src_url = UrlParser(iframe_src_url.query_dict.get('src'))
    return not provider_src_url.scheme or provider_src_url.scheme == "https"
def test_sign_url(self):
    """The same signature results whether w=100 comes from the URL or
    from update_query()."""
    expected = ('http://examples.imgix.net/frog.jpg'
                '?w=100&s=cd3bdf071108af73b15c21bdcee5e49c')

    url = UrlParser('http://examples.imgix.net/frog.jpg?w=100')
    signed = self.provider._sign_url(url, 'abcdef')
    self.assertEqual(signed.unparse(), expected)

    url = UrlParser('http://examples.imgix.net/frog.jpg')
    url.update_query(w=100)
    signed = self.provider._sign_url(url, 'abcdef')
    self.assertEqual(signed.unparse(), expected)
def _do_content_purge(self, url):
    """Does the purge of the content from CloudFlare."""
    data = {
        'files': [
            url,
        ]
    }

    timer = g.stats.get_timer("providers.cloudflare.content_purge")
    timer.start()

    # Get the proper zone id for the purge cache url based on hostname
    if UrlParser(url).hostname == g.image_hosting_domain:
        purge_key_url = g.secrets['cloudflare_purge_key_imagehosting_url']
    else:
        purge_key_url = g.secrets['cloudflare_purge_key_url']

    # NOTE(review): if requests.delete raises, timer.stop() never runs;
    # the response status is also not checked — presumably best-effort.
    response = requests.delete(
        purge_key_url,
        headers={
            'X-Auth-Email': g.secrets['cloudflare_email_address'],
            'X-Auth-Key': g.secrets['cloudflare_api_key'],
            'content-type': 'application/json',
        },
        data=json.dumps(data),
    )
    timer.stop()
def POST_options(self, all_langs, pref_lang, **kw):
    """Validate and persist user preference changes, then redirect to
    the prefs page with ?done=true."""
    #temporary. eventually we'll change pref_clickgadget to an
    #integer preference
    kw['pref_clickgadget'] = kw['pref_clickgadget'] and 5 or 0
    if c.user.pref_show_promote is None:
        kw['pref_show_promote'] = None
    elif not kw.get('pref_show_promote'):
        kw['pref_show_promote'] = False

    # users who can't see over-18 content also get profanity filtering
    if not kw.get("pref_over_18") or not c.user.pref_over_18:
        kw['pref_no_profanity'] = True

    if kw.get("pref_no_profanity") or c.user.pref_no_profanity:
        kw['pref_label_nsfw'] = True

    # ad-related prefs are gold-only; force them on otherwise
    if not c.user.gold:
        kw['pref_show_adbox'] = True
        kw['pref_show_sponsors'] = True

    self.set_options(all_langs, pref_lang, **kw)
    u = UrlParser(c.site.path + "prefs")
    u.update_query(done='true')
    if c.cname:
        u.put_in_frame()
    return self.redirect(u.unparse())
def POST_options(self, all_langs, pref_lang, **kw):
    """Validate and persist user preference changes, then redirect to
    the prefs page with ?done=true."""
    #temporary. eventually we'll change pref_clickgadget to an
    #integer preference
    kw['pref_clickgadget'] = kw['pref_clickgadget'] and 5 or 0
    if c.user.pref_show_promote is None:
        kw['pref_show_promote'] = None
    elif not kw.get('pref_show_promote'):
        kw['pref_show_promote'] = False

    # users who can't see over-18 content also get profanity filtering
    if not kw.get("pref_over_18") or not c.user.pref_over_18:
        kw['pref_no_profanity'] = True

    if kw.get("pref_no_profanity") or c.user.pref_no_profanity:
        kw['pref_label_nsfw'] = True

    # default all the gold options to on if they don't have gold
    if not c.user.gold:
        for pref in ('pref_show_adbox',
                     'pref_show_sponsors',
                     'pref_show_sponsorships',
                     'pref_highlight_new_comments',
                     'pref_monitor_mentions'):
            kw[pref] = True

    self.set_options(all_langs, pref_lang, **kw)
    u = UrlParser(c.site.path + "prefs")
    u.update_query(done='true')
    if c.cname:
        u.put_in_frame()
    return self.redirect(u.unparse())
def _update_redirect_uri(base_redirect_uri, params, as_fragment=False):
    """Attach OAuth response params to the client's redirect URI.

    Params go into the fragment (implicit grant) when as_fragment is
    set, otherwise into the query string.
    """
    redirect = UrlParser(base_redirect_uri)
    if as_fragment:
        redirect.fragment = urlencode(params)
    else:
        redirect.update_query(**params)
    return redirect.unparse()
def process(link):
    """Yield (listing key, sort value, timestamp, fullname) tuples for
    every domain listing this link belongs in.

    Generator used by the by-domain listing precomputer; spam/deleted
    links yield nothing.
    """
    assert link.thing_type == 'link'

    timestamp = link.timestamp
    fname = make_fullname(Link, link.thing_id)

    if not link.spam and not link.deleted:
        if link.url:
            domains = UrlParser(link.url).domain_permutations()
        else:
            domains = []
        ups, downs = link.ups, link.downs

        for tkey, oldest in oldests.iteritems():
            if timestamp > oldest:
                sc = score(ups, downs)
                contr = controversy(ups, downs)
                h = _hot(ups, downs, timestamp)

                for domain in domains:
                    yield ('domain/top/%s/%s' % (tkey, domain),
                           sc, timestamp, fname)
                    yield ('domain/controversial/%s/%s' % (tkey, domain),
                           contr, timestamp, fname)
                    # hot/new only make sense for the all-time bucket
                    if tkey == "all":
                        yield ('domain/hot/%s/%s' % (tkey, domain),
                               h, timestamp, fname)
                        yield ('domain/new/%s/%s' % (tkey, domain),
                               timestamp, timestamp, fname)
def GET_framebuster(self, what = None, blah = None):
    """
    renders the contents of the iframe which, on a cname, checks if
    the user is currently logged into reddit.

    if this page is hit from the primary domain, redirects to the
    cnamed domain version of the site.  If the user is logged in,
    this cnamed version will drop a boolean session cookie on that
    domain so that subsequent page reloads will be caught in
    middleware and a frame will be inserted around the content.

    If the user is not logged in, previous session cookies will be
    emptied so that subsequent refreshes will not be rendered in
    that pesky frame.
    """
    if not c.site.domain:
        return ""
    elif c.cname:
        return FrameBuster(login = (what == "login")).render()
    else:
        path = "/framebuster/"
        if c.user_is_loggedin:
            path += "login/"
        # random component defeats caching of the redirect target
        u = UrlParser(path + str(random.random()))
        u.mk_cname(require_frame = False, subreddit = c.site,
                   port = request.port)
        return self.redirect(u.unparse())
    # the user is not logged in or there is no cname.
    # NOTE(review): unreachable — every branch above returns.
    return FrameBuster(login = False).render()
def _process_data(self, wiki_xml):
    """This method processes the wiki data and extracts what is used.

    Scans a MediaWiki XML export for pages in [[Category:Sequences]]
    and collects the lesswrong.com article paths each one links to.
    Returns {'sequences': [{'title': ..., 'articles': [...]}, ...]}.
    """
    MEDIAWIKI_NS = 'http://www.mediawiki.org/xml/export-0.3/'
    sequences = []
    # markdown-ish wiki links of the form [http://lesswrong.com/lw/... text]
    lw_url_re = re.compile(r'\[(http://lesswrong\.com/lw/[^ ]+) [^\]]+\]')

    for page in wiki_xml.getroot().iterfind(
            './/{%s}page' % MEDIAWIKI_NS):
        # TODO: Change to use iterparse
        # Get the titles
        title = page.findtext('{%s}title' % MEDIAWIKI_NS)

        # See if this page is a sequence page
        sequence_elem = page.xpath(
            "mw:revision[1]/mw:text[contains(., '[[Category:Sequences]]')]",
            namespaces={'mw': MEDIAWIKI_NS})
        if sequence_elem:
            sequence_elem = sequence_elem[0]
            articles = []
            # Find all the lesswrong urls
            for match in lw_url_re.finditer(sequence_elem.text):
                article_url = UrlParser(match.group(1))
                # Only store the path to the article
                article_path = article_url.path
                # Ensure path ends in slash
                # NOTE(review): assumes the path is non-empty — an empty
                # path would raise IndexError here; confirm inputs.
                if article_path[-1] != '/':
                    article_path += '/'
                articles.append(article_path)
            sequences.append({'title': title, 'articles': articles})

    return {'sequences': sequences}
def process_message(msgs, chan):
    """Update get_domain_links(), the Links by domain precomputed query.

    get_domain_links() is a CachedResult which is stored in permacache. To
    update these objects we need to do a read-modify-write which requires
    obtaining a lock. Sharding these updates by domain allows us to run
    multiple consumers (but ideally just one per shard) to avoid lock
    contention.
    """
    from r2.lib.db.queries import add_queries, get_domain_links

    # message bodies are link fullnames; set-comprehension dedupes them
    link_names = {msg.body for msg in msgs}
    links = Link._by_fullname(link_names, return_dict=False)
    print 'Processing %r' % (links, )

    # group links by every domain permutation they belong to
    links_by_domain = defaultdict(list)
    for link in links:
        parsed = UrlParser(link.url)

        # update the listings for all permutations of the link's domain
        for domain in parsed.domain_permutations():
            links_by_domain[domain].append(link)

    for d, links in links_by_domain.iteritems():
        with g.stats.get_timer("link_vote_processor.domain_queries"):
            add_queries(
                queries=[
                    get_domain_links(d, sort, "all")
                    for sort in SORTS],
                insert_items=links,
            )
def resize_image(self, image, width=None, censor_nsfw=False, max_ratio=None):
    """Build an imgix URL that resizes (and optionally censors) an image.

    Raises NotLargeEnough when the requested width exceeds the source
    image's width.  Returns the (possibly signed) imgix URL string.
    """
    url = UrlParser(image['url'])
    url.hostname = g.imgix_domain
    # Let's encourage HTTPS; it's cool, works just fine on HTTP pages, and
    # will prevent insecure content warnings on HTTPS pages.
    url.scheme = 'https'

    if max_ratio:
        url.update_query(fit='crop')
        # http://www.imgix.com/docs/reference/size#param-crop
        url.update_query(crop='faces,entropy')
        url.update_query(arh=max_ratio)

    if width:
        if width > image['width']:
            raise NotLargeEnough()
        # http://www.imgix.com/docs/reference/size#param-w
        url.update_query(w=width)

    if censor_nsfw:
        # Do an initial blur to make sure we're getting rid of icky
        # details.
        #
        # http://www.imgix.com/docs/reference/stylize#param-blur
        url.update_query(blur=600)
        # And then add pixellation to help the image compress well.
        #
        # http://www.imgix.com/docs/reference/stylize#param-px
        url.update_query(px=32)
    if g.imgix_signing:
        url = self._sign_url(url, g.secrets['imgix_signing_token'])
    return url.unparse()
def add_sr(path, sr_path=True, nocname=False, force_hostname=False):
    """
    Given a path (which may be a full-fledged url or a relative path),
    parses the path and updates it to include the subreddit path
    according to the rules set by its arguments:

      * force_hostname: if True, force the url's hostname to be updated
        even if it is already set in the path, and subject to the
        c.cname/nocname combination.  If false, the path will still
        have its domain updated if no hostname is specified in the url.

      * nocname: when updating the hostname, overrides the value of
        c.cname to set the hostname to g.domain.  The default behavior
        is to set the hostname consistent with c.cname.

      * sr_path: if a cname is not used for the domain, updates the
        path to include c.site.path.
    """
    u = UrlParser(path)
    if sr_path and (nocname or not c.cname):
        u.path_add_subreddit(c.site)

    if not u.hostname or force_hostname:
        u.hostname = get_domain(cname=(c.cname and not nocname),
                                subreddit=False)

    # mobile render style gets the .mobile extension on every link
    if c.render_style == 'mobile':
        u.set_extension('mobile')

    return u.unparse()
def url_for_title(self, title):
    """Uses the MediaWiki API to get the URL for a wiki page
    with the given title.

    Returns the full wiki URL, or None if the title is None or the
    lookup/parse fails.  Results are cached for 12 hours.
    """
    if title is None:
        return None

    from pylons import g
    cache_key = ('wiki_url_%s' % title).encode('ascii', 'ignore')
    wiki_url = g.cache.get(cache_key)
    if wiki_url is None:
        # http://www.mediawiki.org/wiki/API:Query_-_Properties#info_.2F_in
        api = UrlParser(g.wiki_api_url)
        api.update_query(action='query', titles=title, prop='info',
                         format='yaml', inprop='url')

        try:
            response = urlopen(api.unparse()).read()
            parsed_response = yaml.load(response, Loader=yaml.CLoader)
            page = parsed_response['query']['pages'][0]
            # inside the try: a missing/None 'fullurl' is a failed lookup
            wiki_url = page['fullurl'].strip()
        except Exception:
            # network, parse, or schema failure: report "no url" and
            # don't cache the failure
            return None

        # Things are created every couple of days so 12 hours seems
        # to be a reasonable cache time
        # BUG FIX: previously this wrote to g.permacache while reading
        # from g.cache, so the cache could never hit; write to the same
        # cache we read from.
        g.cache.set(cache_key, wiki_url, time=3600 * 12)

    return wiki_url
def format_output_url(cls, url, **kw):
    """
    Helper method used during redirect to ensure that the redirect
    url (assisted by frame busting code or javascript) will point to
    the correct domain and not have any extra dangling get parameters.
    The extensions are also made to match and the resulting url is
    utf8 encoded.

    Note: for development purposes, also checks that the port matches
    the request port
    """
    u = UrlParser(url)

    if u.is_reddit_url():
        # make sure to pass the port along if not 80
        if not kw.has_key('port'):
            kw['port'] = request.port

        # disentangle the cname (for urls that would have
        # cnameframe=1 in them)
        u.mk_cname(**kw)

    # make sure the extensions agree with the current page
    if c.extension:
        u.set_extension(c.extension)

    # unparse and encode it in utf8
    rv = _force_unicode(u.unparse()).encode('utf8')
    # reject header-splitting attempts
    if "\n" in rv or "\r" in rv:
        abort(400)
    return rv
def purge_url(self, url):
    """Purge an image (by url) from imgix.

    Reference: http://www.imgix.com/docs/tutorials/purging-images

    Note that as mentioned in the imgix docs, in order to remove
    an image, this function should be used *after* already removing
    the image from our source, or imgix will just re-fetch and
    replace the image with a new copy even after purging.
    """
    parsed = UrlParser(url)

    # rewrite serving hostnames to their purge counterparts
    if parsed.hostname == g.imgix_domain:
        parsed.hostname = g.imgix_purge_domain
    elif parsed.hostname == g.imgix_gif_domain:
        parsed.hostname = g.imgix_gif_purge_domain

    requests.post(
        "https://api.imgix.com/v2/image/purger",
        auth=(g.secrets["imgix_api_key"], ""),
        data={"url": parsed.unparse()},
    )
def POST_request_promo(self, srnames, is_mobile_web, platform, loid, is_refresh):
    """Request an ad from adzerk for the given subreddit names and
    render the winning promoted link (or adserver markup) for the
    client.  Returns nothing when there is no ad to show."""
    self.OPTIONS_request_promo()

    if not srnames:
        return

    # backwards compat
    if platform is None:
        platform = "mobile_web" if is_mobile_web else "desktop"

    srnames = srnames.split('+')

    # request multiple ads in case some are hidden by the builder due
    # to the user's hides/preferences
    response = adzerk_request(srnames, self.get_uid(loid), platform=platform)

    if not response:
        g.stats.simple_event('adzerk.request.no_promo')
        return

    # for adservers, adzerk returns markup so we pass it to the client
    if isinstance(response, AdserverResponse):
        g.stats.simple_event('adzerk.request.adserver')
        return responsive(response.body)

    res_by_campaign = {r.campaign: r for r in response}
    adserver_click_urls = {r.campaign: r.click_url for r in response}
    tuples = [promote.PromoTuple(r.link, 1., r.campaign) for r in response]
    builder = CampaignBuilder(tuples, wrap=default_thing_wrapper(),
                              keep_fn=promote.promo_keep_fn,
                              num=1, skip=True)
    listing = LinkListing(builder, nextprev=False).listing()
    promote.add_trackers(listing.things, c.site,
                         adserver_click_urls=adserver_click_urls)
    promote.update_served(listing.things)
    if listing.things:
        g.stats.simple_event('adzerk.request.valid_promo')
        if is_refresh:
            g.stats.simple_event('adzerk.request.auto_refresh')

        w = listing.things[0]
        r = res_by_campaign[w.campaign]

        # serve impression pixels from our own pixel domain
        up = UrlParser(r.imp_pixel)
        up.hostname = "pixel.redditmedia.com"
        w.adserver_imp_pixel = up.unparse()
        w.adserver_upvote_pixel = r.upvote_pixel
        w.adserver_downvote_pixel = r.downvote_pixel
        w.adserver_click_url = r.click_url
        w.num = ""
        return responsive(w.render(), space_compress=True)
    else:
        g.stats.simple_event('adzerk.request.skip_promo')
def allowed_media_preview_url(url):
    """Return True when url may be shown as a media preview: either it
    has a static image extension or its host is whitelisted."""
    parsed = UrlParser(url)
    if parsed.has_static_image_extension():
        return True
    return any(is_subdomain(parsed.hostname, allowed_domain)
               for allowed_domain in g.media_preview_domain_whitelist)
def GET_framebuster(self):
    """Redirect logged-in users on a cnamed site into the frame page;
    otherwise answer "fail"."""
    if not (c.site.domain and c.user_is_loggedin):
        return "fail"

    frame_url = UrlParser(c.site.path + "/frame")
    frame_url.put_in_frame()
    c.cname = True
    return self.redirect(frame_url.unparse())
def resize_image(self, image, width=None, file_type=None, censor_nsfw=False,
                 max_ratio=None):
    """Build an imgix URL that resizes/converts (and optionally censors)
    an image.  GIF sources being served as gif/mp4 go through the
    dedicated gif imgix domain.  Raises NotLargeEnough when the
    requested width exceeds the source image's width."""
    url = UrlParser(image['url'])
    # gif sources use the gif domain unless being converted to a
    # non-mp4 still format
    is_gif = url.path.endswith('.gif') and (file_type == 'mp4' or
                                            not file_type)

    if is_gif:
        url.hostname = g.imgix_gif_domain
    else:
        url.hostname = g.imgix_domain

    # Let's encourage HTTPS; it's cool, works just fine on HTTP pages, and
    # will prevent insecure content warnings on HTTPS pages.
    url.scheme = 'https'

    # g.s3_media_direct affects how preview image urls are stored
    # True: http://{s3_media_domain}/mybucket/helloworld.jpg
    # False: http://mybucket/helloworld.jpg
    # If it's True, we'll need to strip the bucket out of the path
    if g.s3_media_direct:
        path_parts = url.path.split('/')
        path_parts.pop(1)
        url.path = '/'.join(path_parts)

    if max_ratio:
        url.update_query(fit='crop')
        # http://www.imgix.com/docs/reference/size#param-crop
        url.update_query(crop='faces,entropy')
        url.update_query(arh=max_ratio)

    if width:
        if width > image['width']:
            raise NotLargeEnough()
        # http://www.imgix.com/docs/reference/size#param-w
        url.update_query(w=width)

    if file_type and file_type in ('gif', 'jpg', 'png', 'mp4'):
        url.update_query(fm=file_type)

    # We need to disable fragmented mp4s for proper playback in Firefox
    if file_type == 'mp4':
        url.update_query(**{'mp4-fragmented': 'false'})

    if censor_nsfw:
        # Do an initial blur to make sure we're getting rid of icky
        # details.
        #
        # http://www.imgix.com/docs/reference/stylize#param-blur
        url.update_query(blur=600)
        # And then add pixellation to help the image compress well.
        #
        # http://www.imgix.com/docs/reference/stylize#param-px
        url.update_query(px=32)
    if g.imgix_signing:
        if is_gif:
            url = self._sign_url(url, g.secrets['imgix_gif_signing_token'])
        else:
            url = self._sign_url(url, g.secrets['imgix_signing_token'])
    return url.unparse()
def add_sr(path, sr_path=True, nocname=False, force_hostname=False, retain_extension=True, force_https=False, force_extension=None): """ Given a path (which may be a full-fledged url or a relative path), parses the path and updates it to include the subreddit path according to the rules set by its arguments: * sr_path: if a cname is not used for the domain, updates the path to include c.site.path. * nocname: deprecated. * force_hostname: if True, force the url's hostname to be updated even if it is already set in the path. If false, the path will still have its domain updated if no hostname is specified in the url. * retain_extension: if True, sets the extention according to c.render_style. * force_https: force the URL scheme to https For caching purposes: note that this function uses: c.render_style, c.site.name """ # don't do anything if it is just an anchor if path.startswith(('#', 'javascript:')): return path u = UrlParser(path) if sr_path: u.path_add_subreddit(c.site) if not u.hostname or force_hostname: u.hostname = get_domain(subreddit=False) if (c.secure and u.is_reddit_url()) or force_https: u.scheme = "https" if force_extension is not None: u.set_extension(force_extension) elif retain_extension: if c.render_style == 'mobile': u.set_extension('mobile') elif c.render_style == 'compact': u.set_extension('compact') # SaidIt CUSTOM elif c.render_style == g.extension_subdomain_mobile_v2_render_style: u.set_extension(g.extension_subdomain_mobile_v2_render_style) return u.unparse()
def maps_from_things(things, boost_only=False):
    """We only know how to do links for now.

    Build the list of field dicts to push to the search index, one per
    thing.  With boost_only, emit only the vote/comment-count fields.
    Things whose author was deleted (or that raise AttributeError) are
    skipped.
    """
    maps = []
    if not boost_only:
        # we can avoid looking these up at all if only the boosts were
        # updated
        author_ids = [thing.author_id for thing in things
                      if hasattr(thing, 'author_id')]
        accounts = Account._byID(author_ids, data=True, return_dict=True)
        sr_ids = [thing.sr_id for thing in things
                  if hasattr(thing, 'sr_id')]
        srs = Subreddit._byID(sr_ids, data=True, return_dict=True)

    for thing in things:
        try:
            d = dict(fullname=thing._fullname,
                     ups=thing._ups,
                     downs=thing._downs,
                     num_comments=getattr(thing, 'num_comments', 0))
            if not boost_only:
                a = accounts[thing.author_id]
                sr = srs[thing.sr_id]
                if a._deleted:
                    # if the author was deleted, we won't updated it in
                    # indextank at all
                    continue
                d.update(dict(
                    fullname=thing._fullname,
                    subreddit=sr.name,
                    reddit=sr.name,
                    text=' '.join([thing.title, a.name, sr.name]),
                    author=a.name,
                    timestamp=thing._date.strftime("%s"),
                    sr_id=str(thing.sr_id),
                    over18=yesno(sr.over_18),
                    is_self=yesno(thing.is_self),
                ))
                if thing.is_self:
                    d['site'] = g.domain
                    if thing.selftext:
                        d['selftext'] = thing.selftext
                else:
                    d['url'] = thing.url
                    d['site'] = ' '.join(
                        UrlParser(thing.url).domain_permutations())
            maps.append(d)
        except AttributeError:
            # incompletely-loaded things are silently skipped
            pass
    return maps
def add_sr(path, sr_path=True, nocname=False, force_hostname=False, retain_extension=True, force_https=False): """ Given a path (which may be a full-fledged url or a relative path), parses the path and updates it to include the subreddit path according to the rules set by its arguments: * sr_path: if a cname is not used for the domain, updates the path to include c.site.path. * nocname: when updating the hostname, overrides the value of c.cname to set the hostname to g.domain. The default behavior is to set the hostname consistent with c.cname. * force_hostname: if True, force the url's hostname to be updated even if it is already set in the path, and subject to the c.cname/nocname combination. If false, the path will still have its domain updated if no hostname is specified in the url. * retain_extension: if True, sets the extention according to c.render_style. * force_https: force the URL scheme to https For caching purposes: note that this function uses: c.cname, c.render_style, c.site.name """ # don't do anything if it is just an anchor if path.startswith(('#', 'javascript:')): return path u = UrlParser(path) if sr_path and (nocname or not c.cname): u.path_add_subreddit(c.site) if not u.hostname or force_hostname: if c.secure: u.hostname = request.host else: u.hostname = get_domain(cname=(c.cname and not nocname), subreddit=False) if (c.secure and u.is_reddit_url()) or force_https: u.scheme = "https" if retain_extension: if c.render_style == 'mobile': u.set_extension('mobile') elif c.render_style == 'compact': u.set_extension('compact') return u.unparse()
def _key_from_url(cls, url):
    """Normalize a url into a utf8 cache key.

    For case-insensitive domains the whole url is lowercased; for
    case-sensitive ones only the hostname is.
    """
    if utils.domain(url) not in g.case_sensitive_domains:
        return _force_utf8(UrlParser.base_url(url.lower()))

    # Convert only hostname to lowercase
    parsed = UrlParser(url)
    parsed.hostname = parsed.hostname.lower()
    return _force_utf8(UrlParser.base_url(parsed.unparse()))
def redirect_to_host(hostname, path=None):
    """Redirect (307) to the specified path and host.

    Defaults to the current request path when path is None.
    """
    if path is None:
        path = request.path

    u = UrlParser(path)
    u.hostname = hostname

    # 307 redirect so request method is retained
    abort(307, location=u.unparse())