def get_data(self, argv):
    # argv = {"urls" : [], "worker" : , }
    content = None
    error_code = None
    self.logger.debug("start fetch " + argv["url"])
    try:
        url = argv["url"]
        try:
            with eventlet.Timeout(self.timeout, False):
                headers = {
                    "User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"
                }
                if self.proxy is None:
                    req = urllib2.Request(url, headers=headers)
                    res = urllib2.urlopen(req)
                    content = res.read()
                else:
                    proxy_handler = urllib2.ProxyHandler(self.proxy)
                    opener = urllib2.build_opener(proxy_handler)
                    header_list = []
                    for header in headers:
                        header_list.append((header, headers[header]))
                    opener.addheaders = header_list
                    res = opener.open(url)
                    content = res.read()
        except urllib2.HTTPError, e:
            raise Exception(e.code)
        except urllib2.URLError, e:
            raise Exception("URLError")
    except Exception, e:
        # The outer handler is truncated in the original snippet; record the
        # error so the initialised error_code is used (reconstructed).
        error_code = e
    return content, error_code
def get_sync(self, url, data, type=None,
             content_type="application/x-www-form-urlencoded"):
    try:
        data = urlencode(data)
    except:
        # data is probably a string to be sent directly.
        pass
    headers = {"Content-Type": content_type}
    if type and type.upper() not in ("POST", "GET"):
        from restlib import RestfulRequest  # @UnresolvedImport
        req = RestfulRequest(url, data=data, method=type.upper())
    else:
        req = urllib2.Request(url, data, headers=headers)
    opener = urllib2.build_opener(self)
    eventlet.greenthread.sleep()
    try:
        f = opener.open(req, data=data)
        if f.code is None or str(f.code)[0] == "2":
            dispatcher.send(UrlGetter.HTTP_RESULT, self, result=f.read(),
                            source=url, code=f.code)
        else:
            e = urllib2.HTTPError(
                url, f.code,
                "A code %s HTTP error has occurred when trying to send to target %s"
                % (f.code, url), req.headers, f)
            dispatcher.send(UrlGetter.HTTP_ERROR, self, exception=e)
    # TODO: make sure we're supposed to listen to URLErrors
    except (urllib2.URLError, ValueError), e:
        dispatcher.send(UrlGetter.URL_ERROR, self, exception=e, url=url)
def handle404(self, reqorig, url, container, obj):
    """
    Return a webob.Response which fetches the thumbnail from the thumb
    host, potentially writes it out to Swift so we don't 404 next time,
    and returns it. Note also that the thumb host might write it out
    to Swift so we don't have to.
    """
    # go to the thumb media store for unknown files
    reqorig.host = self.thumbhost
    # upload doesn't like our User-agent, otherwise we could call it
    # using urllib2.url()
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', self.user_agent)]
    # At least in theory, we shouldn't be handing out links to originals
    # that we don't have (or in the case of thumbs, can't generate).
    # However, someone may have a formerly valid link to a file, so we
    # should do them the favor of giving them a 404.
    try:
        upcopy = opener.open(reqorig.url)
    except urllib2.HTTPError, status:
        if status.code == 404:
            resp = webob.exc.HTTPNotFound('Expected original file not found')
            return resp
        else:
            resp = webob.exc.HTTPNotFound('Unexpected error %s' % status)
            return resp
def get_plugin_status(args): plugin, host, request = args url = "%s/plugins/%s/%d/_s/status" % (host, plugin.plugin_name, plugin.id) json = None jail_status = notifier().pluginjail_running(pjail=plugin.plugin_jail) if not jail_status: return plugin, json, jail_status try: opener = urllib2.build_opener() opener.addheaders = [ ('Cookie', 'sessionid=%s' % (request.COOKIES.get("sessionid", ''), )) ] #TODO: Increase timeout based on number of plugins response = opener.open(url, None, 5).read() json = simplejson.loads(response) except Exception, e: log.warn( _("Couldn't retrieve %(url)s: %(error)s") % { 'url': url, 'error': e, })
def plugin_fetch(args): plugin, host, request = args data = None url = "%s/plugins/%s/%d/_s/treemenu" % ( host, plugin.plugin_name, plugin.id ) try: opener = urllib2.build_opener() opener.addheaders = [( 'Cookie', 'sessionid=%s' % ( request.COOKIES.get("sessionid", ''), ) )] # TODO: Increase timeout based on number of plugins response = opener.open(url, None, 5) data = response.read() if not data: log.warn(_("Empty data returned from %s") % (url,)) except Exception, e: log.warn(_("Couldn't retrieve %(url)s: %(error)s") % { 'url': url, 'error': e, })
def _getAuthOpener_LXP150(self, http_user, http_pass): ''' Create an authenticated opener for the LXPx50 series. The LXPx50 HTTP authentication is again weird. First, a request must be sent to the phone with a Cookie with a SessionId set to a random number between 0 and 99999. Sending 0 works just as well. The first request must be a GET that asks the phone to calculate a hash for a specified username and password. The hash is embedded inside a HTML fragment in the response. Next the hash must be sent as a new Cookie in a POST request that also includes the original SessionId number and the UserName as cookies. A successful login returns a response with the phone status, including the phone model. Additionally, the response after a successful login includes a brand new SessionId that must be replaced in the opener cookie. ''' cookiejar = cookielib.CookieJar(cookielib.DefaultCookiePolicy(rfc2965=True)) sesscookie = cookielib.Cookie(None, 'SessionId', '0', None, False, self._ip, False, False, '/', False, False, str((int)(time.time() + 3600)), False, 'SessionId', None, None) cookiejar.set_cookie(sesscookie) opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar)) response = opener.open('http://' + self._ip + '/fcgi/do?' + urllib.urlencode({ 'action': 'Encrypt', 'UserName' : http_user, 'Password' : http_pass})) body = response.read() m = re.search(r"id=hcSingleResult type=hidden value='(.+?)'", body) if m is None: return (None, None) encrypted_password = m.group(1) sesscookie = cookielib.Cookie(None, 'UserName', http_user, None, False, self._ip, False, False, '/', False, False, str((int)(time.time() + 3600)), False, 'UserName', None, None) cookiejar.set_cookie(sesscookie) sesscookie = cookielib.Cookie(None, 'Password', encrypted_password, None, False, self._ip, False, False, '/', False, False, str((int)(time.time() + 3600)), False, 'Password', None, None) cookiejar.set_cookie(sesscookie) response = opener.open('http://' + self._ip + '/fcgi/do?id=1', 'SubmitData=begin%26Operation%3DCreateSession%26DestURL%3Did%6021%26SubmitData%3Dend') # Find new SessionId value. What, no Set-Cookie header? body = response.read() m = re.search(r"id=hcSessionIdNow type=hidden value='(.+?)'", body) if m != None: sesscookie = cookielib.Cookie(None, 'SessionId', m.group(1), None, False, self._ip, False, False, '/', False, False, str((int)(time.time() + 3600)), False, 'SessionId', None, None) cookiejar.set_cookie(sesscookie) else: logging.error('Endpoint %s@%s LXPx50 failed to authenticate - new session ID not found in response' % (self._vendorname, self._ip)) return (None, None) # Subsequent requests must NOT have the UserName/Password cookies cookiejar.clear(self._ip, '/', 'UserName') cookiejar.clear(self._ip, '/', 'Password') return (opener, body)
def build_fetcher_builder(log, client_number):
    """
    Generates a function that generates a function that takes a url,
    reports statistics, and returns a soup.
    TODO: Some of this is routine-dependent. Refactor.
    """
    # This logic happens once per client
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(CookieJar()))

    def build_fetcher(routine_name):
        # This logic happens once per routine
        def fetcher(url, params=None, step=""):
            # This logic happens once per page load
            try:
                time_before = time.time()
                response_text = opener.open(routine.base_url + url, params).read()
                time_after = time.time()
                soup = BeautifulSoup(response_text)
                if response_text is not None:
                    reported_time = response_text[:10].split("|")[0][:-2]
                    log([time_before, time_after, reported_time, client_number,
                         current_processes, url, routine_name, step])
                    return soup
                else:
                    raise Exception()
            except Exception:
                log([time_before, time_after, "", client_number,
                     current_processes, url, routine_name, step])
                return None
        return fetcher
    return build_fetcher
def __init__ (self, manager, useCookies=False): super(URLFetchWorker, self).__init__(manager) self.timeoutTask = 0 if useCookies: self.setCookies() else: self.opener = urllib2.build_opener()
def handle404(self, reqorig, url, container, obj):
    """
    Return a webob.Response which fetches the thumbnail from the thumb
    host, potentially writes it out to Swift so we don't 404 next time,
    and returns it. Note also that the thumb host might write it out
    to Swift so we don't have to.
    """
    # go to the thumb media store for unknown files
    reqorig.host = self.thumbhost
    # upload doesn't like our User-agent, otherwise we could call it
    # using urllib2.url()
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', self.user_agent)]
    # At least in theory, we shouldn't be handing out links to originals
    # that we don't have (or in the case of thumbs, can't generate).
    # However, someone may have a formerly valid link to a file, so we
    # should do them the favor of giving them a 404.
    try:
        upcopy = opener.open(reqorig.url)
    except urllib2.HTTPError, status:
        if status.code == 404:
            resp = webob.exc.HTTPNotFound(
                'Expected original file not found')
            return resp
        else:
            resp = webob.exc.HTTPNotFound('Unexpected error %s' % status)
            return resp
def get_plugin_status(args): plugin, host, request = args url = "%s/plugins/%s/%d/_s/status" % ( host, plugin.plugin_name, plugin.id) json = None jail_status = notifier().pluginjail_running(pjail=plugin.plugin_jail) if not jail_status: return plugin, json, jail_status try: opener = urllib2.build_opener() opener.addheaders = [ ('Cookie', 'sessionid=%s' % ( request.COOKIES.get("sessionid", ''), )) ] #TODO: Increase timeout based on number of plugins response = opener.open(url, None, 5).read() json = simplejson.loads(response) except Exception, e: log.warn(_("Couldn't retrieve %(url)s: %(error)s") % { 'url': url, 'error': e, })
def base_request(self, method, container=None, name=None, prefix=None,
                 headers={}, proxy=None, contents=None, full_listing=None):
    # Common request method
    url = self.url
    if self.token:
        headers['X-Auth-Token'] = self.token
    if container:
        url = '%s/%s' % (url.rstrip('/'), quote(container))
    if name:
        url = '%s/%s' % (url.rstrip('/'), quote(name))
    url += '?format=json'
    if prefix:
        url += '&prefix=%s' % prefix
    if proxy:
        proxy = urlparse.urlparse(proxy)
        proxy = urllib2.ProxyHandler({proxy.scheme: proxy.netloc})
        opener = urllib2.build_opener(proxy)
        urllib2.install_opener(opener)
    req = urllib2.Request(url, headers=headers, data=contents)
    req.get_method = lambda: method
    conn = urllib2.urlopen(req)
    body = conn.read()
    try:
        body_data = json.loads(body)
    except ValueError:
        body_data = None
    return [None, body_data]
def probeModel(self):
    '''Probe specific model of the Hanlong phone

    To probe for the specific model, a http session is tried. After
    authentication, the status page reveals the phone model.
    '''
    sModel = None

    # Try detecting Hanlong with updated firmware
    try:
        password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm()
        password_manager.add_password(None, 'http://' + self._ip + '/',
                                      'admin', 'admin')
        basic_auth_handler = urllib2.HTTPBasicAuthHandler(password_manager)
        opener = urllib2.build_opener(basic_auth_handler)
        response = opener.open('http://' + self._ip + '/')
        htmlbody = response.read()

        # <TR>
        # <td width="220"><script> document.write(jscs.product_type);</script></td>
        # <td width="250">UC862</td>
        # <TR>
        m = re.search(r'product_type\);</script></TD>.*?<TD.*?>(\w+)',
                      htmlbody, re.IGNORECASE | re.DOTALL)
        if m != None:
            sModel = m.group(1)
    except Exception, e:
        pass
def __init__(self, concurrency=10): # a green pool is a pool of greenthreads - you're pushing # tasks to it and they get executed when eventlet's loop is # active self.pool = eventlet.GreenPool(concurrency) # the queue receives URLs to visit self.queue = eventlet.Queue() # our root URL, the first to be fetched self.queue.put("https://play.google.com/store/apps") # after a fetch of an app is finished, results get pushed in # this queue self.results = eventlet.Queue() # we need to make sure we don't fetch the same URL more than # once, otherwise the script might never finish self.seen = set() # `seen_app_ids` cuts down on fetching apps that have been # fetched before; it is necessary in addition to `seen` self.seen_app_ids = set() # just a counter for statistics self.failed = 0 self.cnt = 0 # our opener self.browser = urllib2.build_opener() self.browser.addheaders.append(('Cookie', 'hlSession2=en'))
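# The constructor above only wires up the queues, the pool, and the opener;
# the crawl loop itself is not part of the snippet. Below is a minimal,
# hedged sketch of how such a loop could drive those structures. The method
# names `run` and `fetch`, and the choice to stop once the queue stays empty
# for a few seconds, are assumptions rather than the original code.
import eventlet
from eventlet.queue import Empty

def run(self):
    while True:
        try:
            url = self.queue.get(timeout=5)   # wait briefly for more work
        except Empty:
            break
        if url in self.seen:
            continue                          # never fetch the same URL twice
        self.seen.add(url)
        self.pool.spawn_n(self.fetch, url)    # self.fetch is assumed to exist
    self.pool.waitall()                       # let in-flight fetches finish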
def _enableStaticProvisioning_GXP1450(self, vars): try: # Login into interface opener = urllib2.build_opener(urllib2.HTTPCookieProcessor()) response = opener.open( 'http://' + self._ip + '/cgi-bin/dologin', urllib.urlencode({ 'Login': '******', 'P2': self._http_password, 'gnkey': '0b82' })) body = response.read() if 'dologin' in body: logging.error('Endpoint %s@%s GXP1450 - dologin failed login' % (self._vendorname, self._ip)) return False response = opener.open('http://' + self._ip + '/cgi-bin/update', urllib.urlencode(vars) + '&gnkey=0b82') body = response.read() if 'dologin' in body: logging.error( 'Endpoint %s@%s GXP1450 - dologin failed to keep session' % (self._vendorname, self._ip)) return False return True except socket.error, e: logging.error('Endpoint %s@%s GXP1450 failed to connect - %s' % (self._vendorname, self._ip, str(e))) return False
def main(self, start_url, block_extensions=['.pdf','.gif','.jpg','.JPG','.PNG','.png','.wav','.mp3','.wma'], max_urls = 100): # Set user agent string opener = urllib2.build_opener() opener.addheaders = [ ('User-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.220 Safari/535.1'), ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'), ('Accept-Charset', 'utf-8,gbk;q=0.7,*;q=0.3'), #('Accept-Encoding', 'gzip,deflate,sdch'), ('Accept-Language', 'en-US,en,en-zh;q=0.8'), #('Cache-Control', 'max-age=0'), #('Connection', 'keep-alive') ] urllib2.install_opener(opener) # Get base info (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(start_url) fragments = (scheme, netloc, '', '', '', '') base_url = urlparse.urlunparse(fragments) #print "base_url -> ", base_url mainLink = LinkInfo(None,base_url,u'Main',0,u'first page') self.assignID(mainLink) urls_queue = set([mainLink]) urls_crawled = set() urls_crawled2 = set() pool = eventlet.GreenPool(20) counter = 0 tmpC = 0 while True: #Infinite loop sanity check counter +=1 if counter > max_urls: break for url, body in pool.imap(self.fetch, urls_queue): # Remove this url from the queue set urls_queue = urls_queue - set([url]) # Add url to crawled set urls_crawled = urls_crawled.union(set([url])) urls_crawled2 = urls_crawled2.union(set([url])) # Extract links links = self.extract_links(url, body, block_extensions) if ( links == None ):return urls_crawled if tmpC == 100000 : return urls_crawled tmpC += 1 for link in links: if link not in urls_queue and link not in urls_crawled: # Add link to queue urls_queue = urls_queue.union(set([link])) print u"[valid]: link -> ", link.link return urls_crawled
def _install_opener(self): if has_valid_attr(self.settings,'PROXY_HOST') and has_valid_attr(self.settings,'PROXY_PORT'): proxy_info = { #proxy information 'user' : getattr(self.settings, 'PROXY_USER', ''), 'pass' : getattr(self.settings, 'PROXY_PASS', ''), 'host' : getattr(self.settings, 'PROXY_HOST', ''), #localhost 'port' : getattr(self.settings, 'PROXY_PORT', 80) } # build a new opener that uses a proxy requiring authorization proxy = urllib2.ProxyHandler({"http" :"http://%(user)s:%(pass)s@%(host)s:%(port)d" % proxy_info}) self.opener = urllib2.build_opener(proxy, self.cookie_handler) else: self.opener = urllib2.build_opener(self.cookie_handler)
def vector_sort(wiki_titles, terms_counts_dict, verbose=False): # wiki_titles is a list of wiki titles # terms_counts_dict is a dictionary with term tuples keys and counts values wiki = wikiapi.WikiApi() opener = urllib2.build_opener() opener.addheaders = [('User-agent', 'Mozilla/5.0')] def title_to_article_url(title): spaces_to_underscores = '_'.join(title.split()) utf8_encoded_title = spaces_to_underscores.encode('utf8') url_title = urllib2.quote(utf8_encoded_title) # url escape article_url = wiki.get_article_url(url_title) return article_url def fetch_content(wiki_title): # takes a wiki title and gets the page content # returns a tuple of the wiki title and the cosine value content = opener.open(title_to_article_url(wiki_title)).read() return wiki_title, content term_array = np.array(terms_counts_dict.values()) pool = eventlet.GreenPool(250) # some fail if more than 250 titles_cosines_dict = {} i = 1 leng = len(wiki_titles) for wiki_title, content in pool.imap(fetch_content, wiki_titles[:100]): if verbose: print wiki_title if len(content) < 10: assert False if verbose: print "tokenizing..." tokens = tokenize_article_content(wiki.get_article(content).content) if verbose: print "counting ngrams for tokens..." tokens_counter = Counter(ngram for sent in tokens for i in xrange(1, 6) for ngram in ngrams(sent, i)) if verbose: print "calculating array..." wiki_array = np.array([tokens_counter[term]*math.log(len(term)+1,2)\ for term in terms_counts_dict.iterkeys()]) #*len(term)*len(term) full_wiki_array = np.array(tokens_counter.values()) if verbose: print "calculating cosine..." cosine_value = np.dot(term_array,wiki_array)/ \ (np.linalg.norm(term_array) * np.linalg.norm(full_wiki_array)) titles_cosines_dict[wiki_title] = cosine_value if verbose: print cosine_value if verbose: print i, '/', leng i += 1 if verbose: print titles_cosines_dict if verbose: print sorted_titles_cosines = sorted(titles_cosines_dict, key=titles_cosines_dict.get, reverse=True) if verbose: print sorted_titles_cosines return sorted_titles_cosines
def vector_sort(wiki_titles, terms_counts_dict, verbose=False): # wiki_titles is a list of wiki titles # terms_counts_dict is a dictionary with term tuples keys and counts values wiki = wikiapi.WikiApi() opener = urllib2.build_opener() opener.addheaders = [('User-agent', 'Mozilla/5.0')] def title_to_article_url(title): spaces_to_underscores = '_'.join(title.split()) utf8_encoded_title = spaces_to_underscores.encode('utf8') url_title = urllib2.quote(utf8_encoded_title) # url escape article_url = wiki.get_article_url(url_title) return article_url def fetch_content(wiki_title): # takes a wiki title and gets the page content # returns a tuple of the wiki title and the cosine value content = opener.open(title_to_article_url(wiki_title)).read() return wiki_title, content term_array = np.array(terms_counts_dict.values()) pool = eventlet.GreenPool(250) # some fail if more than 250 titles_cosines_dict = {} i = 1 leng = len(wiki_titles) for wiki_title, content in pool.imap(fetch_content,wiki_titles[:100]): if verbose: print wiki_title if len(content) < 10: assert False if verbose: print "tokenizing..." tokens = tokenize_article_content(wiki.get_article(content).content) if verbose: print "counting ngrams for tokens..." tokens_counter = Counter(ngram for sent in tokens for i in xrange(1,6) for ngram in ngrams(sent,i)) if verbose: print "calculating array..." wiki_array = np.array([tokens_counter[term]*math.log(len(term)+1,2)\ for term in terms_counts_dict.iterkeys()]) #*len(term)*len(term) full_wiki_array = np.array(tokens_counter.values()) if verbose: print "calculating cosine..." cosine_value = np.dot(term_array,wiki_array)/ \ (np.linalg.norm(term_array) * np.linalg.norm(full_wiki_array)) titles_cosines_dict[wiki_title] = cosine_value if verbose: print cosine_value if verbose: print i, '/', leng i += 1 if verbose: print titles_cosines_dict if verbose: print sorted_titles_cosines = sorted(titles_cosines_dict, key=titles_cosines_dict.get, reverse = True) if verbose: print sorted_titles_cosines return sorted_titles_cosines
def _enableStaticProvisioning_GXP140x(self, vars):
    try:
        # Login into interface and get SID. Check proper Content-Type
        cookiejar = cookielib.CookieJar(
            cookielib.DefaultCookiePolicy(rfc2965=True))
        opener = urllib2.build_opener(
            urllib2.HTTPCookieProcessor(cookiejar))
        # response = urllib2.urlopen('http://' + self._ip + '/cgi-bin/dologin',
        response = opener.open(
            'http://' + self._ip + '/cgi-bin/dologin',
            urllib.urlencode({'password': self._http_password}))
        body = response.read()
        content_type = response.info()['Content-Type'].rsplit(';', 1)[0]
        if content_type != 'application/json':
            logging.error(
                'Endpoint %s@%s GXP140x - dologin answered not application/json but %s'
                % (self._vendorname, self._ip, response.info()['Content-Type']))
            return False

        # Check successful login and get sid
        jsonvars = cjson.decode(body)
        if not ('body' in jsonvars and 'sid' in jsonvars['body']):
            logging.error('Endpoint %s@%s GXP140x - dologin failed login'
                          % (self._vendorname, self._ip))
            return False
        sid = jsonvars['body']['sid']

        # Post vars with sid
        vars.update({'sid': sid})
        # response = urllib2.urlopen('http://' + self._ip + '/cgi-bin/api.values.post',
        response = opener.open(
            'http://' + self._ip + '/cgi-bin/api.values.post',
            urllib.urlencode(vars))
        jsonvars = self._parseBotchedJSONResponse(response)
        if jsonvars == None:
            # jsonvars is None here, so do not try to index it in the log message
            logging.error(
                'Endpoint %s@%s GXP140x - empty JSON response for vars - %s - sid %s'
                % (self._vendorname, self._ip, urllib.urlencode(vars), sid))
            return False
        if not ('response' in jsonvars and jsonvars['response'] == 'success'
                and 'body' in jsonvars and 'status' in jsonvars['body']
                and jsonvars['body']['status'] == 'right'):
            logging.error(
                'Endpoint %s@%s GXP140x - vars rejected by interface - %s - %s - %s'
                % (self._vendorname, self._ip, urllib.urlencode(vars),
                   jsonvars['body'], sid))
            return False
        return True
    except cjson.DecodeError, e:
        logging.error('Endpoint %s@%s GXP140x received invalid JSON - %s'
                      % (self._vendorname, self._ip, str(e)))
        return False
def build_urllib2_opener(config): kwargs = {'key_file': config.ssl_client_key_path, 'cert_file': config.ssl_client_cert_path, 'ca_cert_file': config.ssl_ca_cert_path, 'verify_host': bool(config.ssl_validation), # None -> False 'proxy_url': config.proxy_url, 'proxy_port': config.proxy_port,} handler = PulpHandler(**kwargs) return urllib2.build_opener(handler)
def setUp(self): FakeProxyHandler.digest_auth_handler.set_users( {self.USER: self.PASSWD}) FakeProxyHandler.digest_auth_handler.set_realm(self.REALM) self.server = LoopbackHttpServerThread(self.PORT, FakeProxyHandler) self.server.start() self.server.ready.wait() handler = urllib2.ProxyHandler({"http": self.PROXY_URL}) self._digest_auth_handler = urllib2.ProxyDigestAuthHandler() self.opener = urllib2.build_opener(handler, self._digest_auth_handler)
def _install_opener(self): if has_valid_attr(self.settings, 'PROXY_HOST') and has_valid_attr( self.settings, 'PROXY_PORT'): proxy_info = { #proxy information 'user': getattr(self.settings, 'PROXY_USER', ''), 'pass': getattr(self.settings, 'PROXY_PASS', ''), 'host': getattr(self.settings, 'PROXY_HOST', ''), #localhost 'port': getattr(self.settings, 'PROXY_PORT', 80) } # build a new opener that uses a proxy requiring authorization proxy = urllib2.ProxyHandler({ "http": "http://%(user)s:%(pass)s@%(host)s:%(port)d" % proxy_info }) self.opener = urllib2.build_opener(proxy, self.cookie_handler) else: self.opener = urllib2.build_opener(self.cookie_handler)
def _rebootbyhttp(self): opener = urllib2.build_opener(urllib2.HTTPCookieProcessor()) response = opener.open('http://' + self._ip + '/cgi-bin//api-sys_operation?passcode=' + self._http_password + '&request=REBOOT') jsonvars = self._parseBotchedJSONResponse(response) if jsonvars == None: return False if not ('response' in jsonvars and jsonvars['response'] == 'success'): logging.error('Endpoint %s@%s unimplemented reboot by HTTP' % (self._vendorname, self._ip)) return False return True
def setUp(self): FakeProxyHandler.digest_auth_handler.set_users({ self.USER : self.PASSWD }) FakeProxyHandler.digest_auth_handler.set_realm(self.REALM) self.server = LoopbackHttpServerThread(self.PORT, FakeProxyHandler) self.server.start() self.server.ready.wait() handler = urllib2.ProxyHandler({"http" : self.PROXY_URL}) self._digest_auth_handler = urllib2.ProxyDigestAuthHandler() self.opener = urllib2.build_opener(handler, self._digest_auth_handler)
def pull(title): """pull all the infobox goodies from wikipedia""" url = "http://en.wikipedia.org/w/index.php?action=render&title=%s" % title opener = urllib2.build_opener() opener.addheaders = [('User-agent', 'Mozilla/5.0 (user-agent-restrictions-are-silly)')] try: html = opener.open(url.encode("utf-8")).read() except: print(u" Could not fetch %s" % url).encode('utf-8') return None try: soup = BeautifulSoup.BeautifulSoup(html) except: print(u" Could not parse %s" % url).encode('utf-8') return None # Extract information infobox = soup.find("table", {'class': re.compile(r'\binfobox\b')}) if not infobox: print(u" No infobox found in %s" % url).encode('utf-8') return None information = {} name = infobox.find("th", {'class': 'fn org'}) if name: information['name'] = extract_text(name) def grab(info, name=None): if name is None: name = info.lower() text = infobox.find("text", text=info) if text: information[name] = extract_text(text.parent.findNext("td")) grab("Capital") grab("Admission to Union", "admitted") pop = infobox.find("text", text="Population") if pop: text = pop.findNext("text", text=re.compile("Total$")) if text: information['population'] = extract_text( text.parent.findNext("td")) grab(re.compile("Latitude$"), "latitude") grab(re.compile("Longitude$"), "longitude") text = infobox.find("text", text=re.compile("Motto")) if text: information["motto"] = extract_text(text.findNext("i")) information["description"] = extract_text(infobox.findNext("p")) return information
def get_class(patent_id, cluster_id):
    url = BASE_URL + fix_patent_number(patent_id)
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    try:
        html = opener.open(url).read()
        soup = bs(html)
        toptd = soup.find('td', text='U.S. Classification')
        classstring = toptd.findNext('td').findNext('span').text
        mainclass, subclass = classstring.split('/')
        return patent_id, cluster_id, mainclass, subclass
    except:
        print "Couldn't get", patent_id
        return ('', '', '', '')
def openAnything(source, referrer=None, etag=None, lastmodified=None, agent=USER_AGENT): """URL, filename, or string --> stream This function lets you define parsers that take any input source (URL, pathname to local or network file, or actual data as a string) and deal with it in a uniform manner. Returned object is guaranteed to have all the basic stdio read methods (read, readline, readlines). Just .close() the object when you're done with it. If the etag argument is supplied, it will be used as the value of an If-None-Match request header. If the lastmodified argument is supplied, it must be a formatted date/time string in GMT (as returned in the Last-Modified header of a previous request). The formatted date/time will be used as the value of an If-Modified-Since request header. If the agent argument is supplied, it will be used as the value of a User-Agent request header. """ if hasattr(source, 'read'): return source if source == '-': return sys.stdin if urlparse.urlparse(source)[0] == 'http': # open URL with urllib2 request = urllib2.Request(source) request.add_header('User-Agent', agent) if referrer: print "Adding referrer %s" % referrer request.add_header('Referer', referrer) if lastmodified: request.add_header('If-Modified-Since', lastmodified) if etag: request.add_header('If-None-Match', etag) request.add_header('Accept-encoding', 'gzip') opener = urllib2.build_opener(SmartRedirectHandler(), DefaultErrorHandler()) return opener.open(request) # try to open with native open function (if source is a filename) try: return open(source) except (IOError, OSError): pass # treat source as string return StringIO(str(source))
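# A hedged usage sketch for the conditional-GET behaviour described in the
# docstring of openAnything above. The URL, etag and lastmodified values are
# placeholders; the .status attribute is assumed to be set by the
# SmartRedirectHandler/DefaultErrorHandler classes the function installs, and
# the body may be gzip-compressed because of the Accept-encoding header.
source = openAnything('http://example.com/feed.xml',
                      etag='"abc123"',
                      lastmodified='Mon, 06 Jan 2014 00:00:00 GMT')
data = source.read()
source.close()
if getattr(source, 'status', None) == 304:
    print 'Not modified since the last fetch; reuse the cached copy'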
def get_class(patent_id, cluster_id):
    url = BASE_URL + fix_patent_number(patent_id)
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    try:
        html = opener.open(url).read()
        soup = bs(html)
        toptd = soup.find('td', text='U.S. Classification')
        classstring = toptd.findNext('td').findNext('span').text
        mainclass, subclass = classstring.split('/')
        return patent_id, cluster_id, mainclass, subclass
    except:
        print "Couldn't get", patent_id
        return ('', '', '', '')
def probeModel(self): '''Probe specific model of Aastra phone The Aastra web admin interface uses Basic authentication for access control. The authentication realm exposes the phone model like this: HTTP/1.1 401 Unauthorized Server: Aragorn WWW-Authenticate: Basic realm="Aastra 6757i" Connection: close Content-Length: 745 Content-Type: text/html ''' sModel = None try: # Do not expect this to succeed. Only interested in exception. urllib2.urlopen('http://' + self._ip + '/') except urllib2.HTTPError, e: if e.code == 401 and 'WWW-Authenticate' in e.headers: m = re.search(r'realm="Aastra (.+)"', e.headers['WWW-Authenticate']) if m != None: sModel = m.group(1) else: self._http_username = '******' self._http_password = '******' password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm( ) password_manager.add_password(None, 'http://' + self._ip + '/', self._http_username, self._http_password) basic_auth_handler = urllib2.HTTPBasicAuthHandler( password_manager) opener = urllib2.build_opener(basic_auth_handler) try: response = opener.open('http://' + self._ip + '/sysinfo.html') htmlbody = response.read() # <TR> # <TD style="BORDER-BOTTOM: 1px dashed">Platform</TD> # <TD style="BORDER-BOTTOM: 1px dashed">9112i Revision 0</TD></TR> # <TR> m = re.search(r'Platform</TD>.*?<TD.*?>(\w+)', htmlbody, re.IGNORECASE | re.DOTALL) if m != None: sModel = m.group(1) except Exception, e: pass
def pull(title): """pull all the infobox goodies from wikipedia""" url = "http://en.wikipedia.org/w/index.php?action=render&title=%s" % title opener = urllib2.build_opener() opener.addheaders = [('User-agent', 'Mozilla/5.0 (user-agent-restrictions-are-silly)')] try: html = opener.open(url.encode("utf-8")).read() except: print (u" Could not fetch %s" % url).encode('utf-8') return None try: soup = BeautifulSoup.BeautifulSoup(html) except: print (u" Could not parse %s" % url).encode('utf-8') return None # Extract information infobox = soup.find("table", { 'class': re.compile(r'\binfobox\b') }) if not infobox: print (u" No infobox found in %s" % url).encode('utf-8') return None information = {} name = infobox.find("th", { 'class': 'fn org' }) if name: information['name'] = extract_text(name) def grab(info, name=None): if name is None: name = info.lower() text = infobox.find("text", text=info) if text: information[name] = extract_text(text.parent.findNext("td")) grab("Capital") grab("Admission to Union", "admitted") pop = infobox.find("text", text="Population") if pop: text = pop.findNext("text", text=re.compile("Total$")) if text: information['population'] = extract_text(text.parent.findNext("td")) grab(re.compile("Latitude$"), "latitude") grab(re.compile("Longitude$"), "longitude") text = infobox.find("text", text=re.compile("Motto")) if text: information["motto"] = extract_text(text.findNext("i")) information["description"] = extract_text(infobox.findNext("p")) return information
def fetchFiles(name, url): if not exists(name): time.sleep(random.random() * wait) opener = urllib2.build_opener() opener.addheaders = [ ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'), ('Accept-Encoding', 'gzip, deflate'), ('Connection', 'close'), ('Proxy-Authorization', randomHexString()), ('User-agent', randomUA()) ] r = opener.open(url) open(name, 'wb').write(r.read()) else: stderr.write(name + ' already exists [' + url + ']\n') return name
def _request(self, statement, timeout=0): """ Builds the query string, then opens a connection to the endpoint and returns the file descriptor. """ query = self._queryString(statement) buf = tempfile.NamedTemporaryFile() opener = urllib2.build_opener(RedirectHandler) opener.addheaders = self.headers().items() try: response = self._build_response(query, opener, buf, timeout) except SparqlException, error: self.endpoint = error.message response = self._build_response(query, opener, buf, timeout)
def main(self, start_url, block_extensions=['.pdf'], max_urls = 100): # Set user agent string opener = urllib2.build_opener() opener.addheaders = [ ('User-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.220 Safari/535.1'), ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'), ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'), ('Accept-Encoding', 'gzip,deflate,sdch'), ('Accept-Language', 'en-US,en;q=0.8'), ('Cache-Control', 'max-age=0'), ('Connection', 'keep-alive') ] urllib2.install_opener(opener) # Get base info (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(start_url) fragments = (scheme, netloc, '', '', '', '') base_url = urlparse.urlunparse(fragments) urls_queue = set([base_url]) urls_crawled = set() pool = eventlet.GreenPool(20) counter = 0 while True: #Infinite loop sanity check counter +=1 if counter > max_urls: break for url, body in pool.imap(self.fetch, urls_queue): # Remove this url from the queue set urls_queue = urls_queue - set([url]) # Add url to crawled set urls_crawled = urls_crawled.union(set([url])) # Extract links links = self.extract_links(url, body, block_extensions) for link in links: if link not in urls_queue and link not in urls_crawled: # Add link to queue urls_queue = urls_queue.union(set([link])) return urls_crawled
def request(self, host, handler, request_body, verbose): '''Send xml-rpc request using proxy''' # We get a traceback if we don't have this attribute: self.verbose = verbose url = 'http://' + host + handler request = urllib2.Request(url) request.add_data(request_body) # Note: 'Host' and 'Content-Length' are added automatically base64string = base64.encodestring( '%s:%s' % (self._username, self._password)).replace('\n', '') request.add_header("Authorization", "Basic %s" % base64string) request.add_header('User-Agent', self.user_agent) request.add_header('Content-Type', 'text/xml') proxy_handler = urllib2.ProxyHandler() opener = urllib2.build_opener(proxy_handler) fhandle = opener.open(request) return(self.parse_response(fhandle))
def request(self, host, handler, request_body, verbose): '''Send xml-rpc request using proxy''' # We get a traceback if we don't have this attribute: self.verbose = verbose url = 'http://' + host + handler request = urllib2.Request(url) request.add_data(request_body) # Note: 'Host' and 'Content-Length' are added automatically base64string = base64.encodestring( '%s:%s' % (self._username, self._password)).replace('\n', '') request.add_header("Authorization", "Basic %s" % base64string) request.add_header('User-Agent', self.user_agent) request.add_header('Content-Type', 'text/xml') proxy_handler = urllib2.ProxyHandler() opener = urllib2.build_opener(proxy_handler) fhandle = opener.open(request) return (self.parse_response(fhandle))
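# A hedged sketch of how a proxy-aware transport like the one above is
# typically plugged into xmlrpclib. `ProxiedTransport` is a placeholder name
# for whatever class defines the request() method; its constructor arguments
# (if any) are not shown in the snippet and are assumed here.
import xmlrpclib

transport = ProxiedTransport()
server = xmlrpclib.ServerProxy('http://bugzilla.example.org/xmlrpc.cgi',
                               transport=transport)
print server.system.listMethods()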
def coordinate_solver(id): global coordinates global coordinates_id global commune global departement mycid = 0 s = eventlet.semaphore.Semaphore(1) cj = CookieJar() opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) body = opener.open("http://www.cadastre.gouv.fr/scpc/accueil.do") page = body.read() data = {'ville': commune, 'codeDepartement': departement} body = opener.open("http://www.cadastre.gouv.fr/scpc/rechercherPlan.do", urllib.urlencode(data)) page = body.read() m = re.search("afficherCarteCommune.do.c=([^']+)'", page) codeCommune = None if m: codeCommune = m.group(1) else: m = re.search('select name="codeCommune" id="codeCommune" class="long erreur"><option value="">Choisir</option><option value="([^"]+)"', page) if m: codeCommune = m.group(1) data = {"codeCommune": codeCommune, 'codeDepartement': departement, 'nbResultatParPage': 10, 'x':153, 'y':6} body = opener.open("http://www.cadastre.gouv.fr/scpc/rechercherPlan.do", urllib.urlencode(data)) page = body.read() if codeCommune == None: print "commune: ERROR" return "ERROR" body = opener.open("http://www.cadastre.gouv.fr/scpc/afficherCarteCommune.do?c="+codeCommune) page = body.read() if re.search("Impossible d'initialiser", page): print "carte: ERROR" return "ERROR" while True: with s: mycid = coordinates_id coordinates_id += 1 if mycid >= len(coordinates): break coordargs = coordinates[mycid] m = re.search('^([^ ]+) ([^ ]+) ([^ ]+)$', coordargs) if coordinate_xml_solver(opener, codeCommune, m.group(2), m.group(1)) == False: print "New tests for coords" coordinate_xml_solver(opener, codeCommune, m.group(3), m.group(1))
def create_objects(data): # TODO: handle errors url = data.get('url') token = data.get('token') delete_at = data.get('delete_at') opener = urllib2.build_opener(urllib2.HTTPHandler) request = urllib2.Request(url, data='') request.add_header('Content-Type', 'text/plain') request.add_header('X-Auth-Token', token) request.add_header('X-Delete-At', delete_at) request.get_method = lambda: 'PUT' result = opener.open(request) txid = result.headers.getheader('X-Trans-Id') status = result.getcode() return url, txid, status
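# Hedged usage sketch for create_objects above: a PUT with X-Delete-At makes
# Swift expire the object automatically. The URL and token are placeholders.
import time

url, txid, status = create_objects({
    'url': 'http://swift.example.org/v1/AUTH_test/container/object',
    'token': 'AUTH_tk_0123456789abcdef',
    'delete_at': str(int(time.time()) + 3600),  # expire in one hour
})
print url, txid, status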
def get_response(self, data=None): """ Returns the response object from a request. Cookies are supported via a CookieHandler object """ self._normalize_url() request = urllib2.Request(self.url, data, self.headers) opener = urllib2.build_opener(self.cookie_handler) if REQUEST_TIMEOUT is not None: response = opener.open(request, timeout=REQUEST_TIMEOUT) else: response = opener.open(request) self.cookie_handler.save_cookies() return response
def openurl(url): headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36', 'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4', 'Accept-Encoding': 'gzip,deflate', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Cache-Control': 'max-age=0' } opener = urllib2.build_opener() opener.addheaders = headers.items() response = opener.open(ext.iriToUri(url)) if response.info().get('Content-Encoding') == 'gzip': buf = StringIO(response.read()) response = gzip.GzipFile(fileobj=buf) page = response.read() return page
def _request(self, statement, timeout=0): """ Builds the query string, then opens a connection to the endpoint and returns the file descriptor. """ query = self._queryString(statement) buf = tempfile.NamedTemporaryFile() opener = urllib2.build_opener() opener.addheaders = self.headers().items() request = self._build_request(query) response = self._get_response(opener, request, buf, timeout if timeout > 0 else None) self._read_response(response, buf, timeout) buf.seek(0) return buf
def get_wiki_content(title): # title is in unicode (utf-8) format with spaces, without underscores and # url escape characters wiki = wikiapi.WikiApi() spaces_to_underscores = '_'.join(title.split()) utf8_encoded_title = spaces_to_underscores.encode('utf8') url_title = urllib2.quote(utf8_encoded_title) # url escape article_url = wiki.get_article_url(url_title) # print repr(article_url) opener = urllib2.build_opener() opener.addheaders = [('User-agent', 'Mozilla/5.0')] content = opener.open(article_url).read() art = wiki.get_article(content) # print "Got article: ", art.heading # print "Content: ", art.content # print return art.content
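# Minimal, hedged example call for get_wiki_content above; the title is
# arbitrary and wikiapi must be importable for the function itself to work.
text = get_wiki_content(u'Alan Turing')
print text[:200]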
def handle404(self, reqorig, url, container, obj):
    """
    Return a webob.Response which fetches the thumbnail from the thumb
    host, potentially writes it out to Swift so we don't 404 next time,
    and returns it. Note also that the thumb host might write it out
    to Swift so we don't have to.
    """
    # go to the thumb media store for unknown files
    reqorig.host = self.thumbhost
    # upload doesn't like our User-agent, otherwise we could call it
    # using urllib2.url()
    opener = urllib2.build_opener()
    # Pass on certain headers from the caller squid to the scalers
    opener.addheaders = []
    if reqorig.headers.get('User-Agent') != None:
        opener.addheaders.append(('User-Agent', reqorig.headers.get('User-Agent')))
    else:
        opener.addheaders.append(('User-Agent', self.user_agent))
    for header_to_pass in ['X-Forwarded-For', 'X-Original-URI']:
        if reqorig.headers.get(header_to_pass) != None:
            opener.addheaders.append((header_to_pass, reqorig.headers.get(header_to_pass)))
    # At least in theory, we shouldn't be handing out links to originals
    # that we don't have (or in the case of thumbs, can't generate).
    # However, someone may have a formerly valid link to a file, so we
    # should do them the favor of giving them a 404.
    try:
        # break apart the url, url-encode it, and put it back together
        urlobj = list(urlparse.urlsplit(reqorig.url))
        urlobj[2] = urllib2.quote(urlobj[2], '%/')
        encodedurl = urlparse.urlunsplit(urlobj)
        # ok, call the encoded url
        upcopy = opener.open(encodedurl)
    except urllib2.HTTPError, status:
        if status.code == 404:
            resp = webob.exc.HTTPNotFound('Expected original file not found')
            return resp
        else:
            resp = webob.exc.HTTPNotFound('Unexpected error %s' % status)
            resp.status = status.code
            return resp
def _request(self, statement, timeout=0): """ Builds the query string, then opens a connection to the endpoint and returns the file descriptor. """ resultsType = 'xml' query = self._queryString(statement) buf = tempfile.NamedTemporaryFile() opener = urllib2.build_opener() opener.addheaders = self.headers().items() request = self._build_request(query) response = self._get_response(opener, request, buf) self._read_response(response, buf, timeout) buf.seek(0) return buf
def _request(self, statement, timeout=0): """ Builds the query string, then opens a connection to the endpoint and returns the file descriptor. """ query = self._queryString(statement) buf = tempfile.NamedTemporaryFile() opener = ev_request.build_opener(RedirectHandler) opener.addheaders = list(self.headers().items()) try: if type(query) is not bytes and not six.PY2: query = query.encode() response = self._build_response(query, opener, buf, timeout) except SparqlException as error: self.endpoint = error.message response = self._build_response(query, opener, buf, timeout) self._read_response(response, buf, timeout) buf.seek(0) return buf
def proxyTest(self, row):
    proxy = row[0] + ":" + row[1]
    if 'HTTPS' in row[3]:
        proxies = {"https": "https://" + proxy}
    else:
        proxies = {"http": "http://" + proxy}
    ip = row[0]
    port = row[1]
    theProxy = urllib2.ProxyHandler(proxies)
    opener = urllib2.build_opener(theProxy)
    urllib2.install_opener(opener)
    testResult = 'ok!'
    try:
        webcode = urllib2.urlopen("https://www.fliggy.com/", timeout=10).getcode()
        #logger.info("Proxy %s is ok" % proxy)
    except Exception, e:
        #logger.warn("Proxy %s is no longer ok" % proxy)
        self.clean(ip=ip, port=port)
        testResult = 'no longer ok!'
def _doAuthPost(self, urlpath, postvars): '''Perform an HTTP POST on a particular URL using the HTTP credentials This method is frequently used to make the phone use the Elastix server as the TFTP source for autoprovisioning. ''' password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm() password_manager.add_password(None, 'http://' + self._ip + '/', self._http_username, self._http_password) basic_auth_handler = urllib2.HTTPBasicAuthHandler(password_manager) digest_auth_handler = urllib2.HTTPDigestAuthHandler(password_manager) opener = urllib2.build_opener(basic_auth_handler, digest_auth_handler) if postvars != None: opener.addheaders = [('Content-Type', 'application/x-www-form-urlencoded')] if not isinstance(postvars, str): postvars = urllib.urlencode(postvars) try: opener.open('http://' + self._ip + urlpath, postvars) except urllib2.HTTPError, e: logging.error('Endpoint %s@%s failed to authenticate - %s' % (self._vendorname, self._ip, str(e))) return False
def get_sync(self, url, data, type=None,
             content_type="application/x-www-form-urlencoded"):
    try:
        data = urlencode(data)
    except:
        # data is probably a string to be sent directly.
        pass
    headers = {"Content-Type": content_type}
    if type and type.upper() not in ("POST", "GET"):
        from restlib import RestfulRequest  # @UnresolvedImport
        req = RestfulRequest(url, data=data, method=type.upper())
    else:
        req = urllib2.Request(url, data, headers=headers)
    opener = urllib2.build_opener(self)
    try:
        f = opener.open(req, data=data)
        if f.code is None or str(f.code)[0] == "2":
            dispatcher.send(UrlGetter.HTTP_RESULT, self, result=f.read(),
                            source=url, code=f.code)
        else:
            e = urllib2.HTTPError(
                url, f.code,
                "A code %s HTTP error has occurred when trying to send to target %s"
                % (f.code, url), req.headers, f)
            dispatcher.send(UrlGetter.HTTP_ERROR, self, exception=e)
    except urllib2.URLError, e:
        dispatcher.send(UrlGetter.URL_ERROR, self, exception=e, url=url)
def _enableStaticProvisioning_BT200(self, vars): try: # Login into interface cookiejar = cookielib.CookieJar( cookielib.DefaultCookiePolicy(rfc2965=True)) opener = urllib2.build_opener( urllib2.HTTPCookieProcessor(cookiejar)) response = opener.open( 'http://' + self._ip + '/dologin.htm', urllib.urlencode({ 'Login': '******', 'P2': self._http_password, 'gnkey': '0b82' })) body = response.read() if 'dologin.htm' in body: logging.error('Endpoint %s@%s BT200 - dologin failed login' % (self._vendorname, self._ip)) return False # Force cookie version to 0 for cookie in cookiejar: cookie.version = 0 response = opener.open('http://' + self._ip + '/update.htm', urllib.urlencode(vars) + '&gnkey=0b82') body = response.read() if 'dologin.htm' in body: logging.error( 'Endpoint %s@%s BT200 - dologin failed to keep session' % (self._vendorname, self._ip)) return False return True except urllib2.HTTPError, e: logging.error( 'Endpoint %s@%s BT200 failed to send vars to interface - %s' % (self._vendorname, self._ip, str(e))) return False
def setCookies(self): cookie_file_path = os.path.join(os.environ['LOCALAPPDATA'], r'Google\Chrome\User Data\Default\Cookies') if not os.path.exists(cookie_file_path): raise Exception('Cookies file not exist!') #fetch domain from website, domain = "" tmpList = re.compile(r'http://(.*?)/').findall(self.manager.webSite) if tmpList: domain = tmpList[0] else: #webSite is a root domain = re.compile(r'http://(.*?)').findall(self.manager.webSite) sql = 'select host_key, name, encrypted_value, path from cookies' sql += ' where host_key like "%{}%"'.format(domain) with sqlite3.connect(cookie_file_path) as conn: rows = conn.execute(sql) cookiejar = cookielib.CookieJar() for row in rows: #get encrypted value pwdHash = str(row[2]) try: ret = win32crypt.CryptUnprotectData(pwdHash, None, None, None, 0) except: print 'Fail to decrypt chrome cookies' sys.exit(-1) cookie_item = cookielib.Cookie(version=0, name=row[1], value=ret[1], port=None, port_specified=None,domain=row[0], domain_specified=None, domain_initial_dot=None,path=row[3], path_specified=None,secure=None,expires=None, discard=None,comment=None,comment_url=None,rest=None,rfc2109=False, ) cookiejar.set_cookie(cookie_item) # Apply each cookie_item to cookiejar self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
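# Hedged usage sketch: once setCookies() above has loaded the Chrome cookies
# into self.opener, requests made through that opener carry them
# automatically. `worker` stands for an instance of the surrounding class.
worker.setCookies()
response = worker.opener.open(worker.manager.webSite)
html = response.read()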
def probeModel(self): '''Probe specific model of the Snom phone The Snom phone displays the phone model in the title screen, which is unsecured by default. ''' self._loadCustomCredentials() sModel = None try: password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm() if self._http_password != None: password_manager.add_password(None, 'http://' + self._ip + '/', self._http_username, self._http_password) basic_auth_handler = urllib2.HTTPBasicAuthHandler(password_manager) opener = urllib2.build_opener(basic_auth_handler) response = opener.open('http://' + self._ip + '/') htmlbody = response.read() if response.code == 200: # <TITLE>snom 320</TITLE> m = re.search(r'<TITLE>snom (\w+)</TITLE>', htmlbody, re.IGNORECASE) if m != None: sModel = m.group(1) else: # M300, M700 m = re.search(r'<TITLE>(M\d+)</TITLE>', htmlbody, re.IGNORECASE) if m != None: sModel = m.group(1) #except urllib2.HTTPError, e: # if e.code == 401 and 'WWW-Authenticate' in e.headers: # m = re.search(r'realm="Aastra (.+)"', e.headers['WWW-Authenticate']) # if m != None: sModel = m.group(1) except Exception, e: pass
def __init__(self, concurrency=10): # a green pool is a pool of greenthreads - you're pushing # tasks to it and they get executed when eventlet's loop is # active self.pool = eventlet.GreenPool(concurrency) # the queue receives URLs to visit self.queue = eventlet.Queue() # our root URL, the first to be fetched self.queue.put("https://market.android.com/") # after a fetch of an app is finished, results get pushed in # this queue self.results = eventlet.Queue() # we need to make sure we don't fetch the same URL more than # once, otherwise the script might never finish self.seen = set() # `seen_app_ids` cuts down on fetching apps that have been # fetched before; it is necessary in addition to `seen` self.seen_app_ids = set() # just a counter for statistics self.failed = 0 # our opener self.browser = urllib2.build_opener() self.browser.addheaders.append(('Cookie', 'hlSession2=en'))
def handle404(self, reqorig, url, container, obj): """ Return a swob.Response which fetches the thumbnail from the thumb host and returns it. Note also that the thumb host might write it out to Swift so it won't 404 next time. """ # upload doesn't like our User-agent, otherwise we could call it # using urllib2.url() thumbor_opener = urllib2.build_opener(DumbRedirectHandler()) # Pass on certain headers from Varnish to Thumbor thumbor_opener.addheaders = [] if reqorig.headers.get('User-Agent') is not None: thumbor_opener.addheaders.append( ('User-Agent', reqorig.headers.get('User-Agent'))) else: thumbor_opener.addheaders.append(('User-Agent', self.user_agent)) for header_to_pass in [ 'X-Forwarded-For', 'X-Forwarded-Proto', 'Accept', 'Accept-Encoding', 'X-Original-URI' ]: if reqorig.headers.get(header_to_pass) is not None: header = (header_to_pass, reqorig.headers.get(header_to_pass)) thumbor_opener.addheaders.append(header) # At least in theory, we shouldn't be handing out links to originals # that we don't have (or in the case of thumbs, can't generate). # However, someone may have a formerly valid link to a file, so we # should do them the favor of giving them a 404. try: thumbor_encodedurl = self.thumborify_url(reqorig, self.thumborhost) upcopy = thumbor_opener.open(thumbor_encodedurl) except urllib2.HTTPError as error: # Wrap the urllib2 HTTPError into a swob HTTPException status = error.code body = error.fp.read() headers = error.hdrs.items() if status not in swob.RESPONSE_REASONS: # Generic status description in case of unknown status reasons. status = "%s Error" % status return swob.HTTPException(status=status, body=body, headers=headers) except urllib2.URLError as error: msg = 'There was a problem while contacting the thumbnailing service: %s' % \ error.reason return swob.HTTPServiceUnavailable(msg) # We successfully generated a thumbnail on the active DC, send the same request # blindly to the inactive DC to populate Swift there, not waiting for the response inactivedc_encodedurl = self.thumborify_url( reqorig, self.inactivedc_thumborhost) eventlet.spawn(self.inactivedc_request, thumbor_opener, inactivedc_encodedurl) # get the Content-Type. uinfo = upcopy.info() c_t = uinfo.gettype() resp = swob.Response(app_iter=upcopy, content_type=c_t) headers_whitelist = [ 'Content-Length', 'Content-Disposition', 'Last-Modified', 'Accept-Ranges', 'XKey', 'Thumbor-Engine', 'Server', 'Nginx-Request-Date', 'Nginx-Response-Date', 'Thumbor-Processing-Time', 'Thumbor-Processing-Utime', 'Thumbor-Request-Id', 'Thumbor-Request-Date' ] # add in the headers if we've got them for header in headers_whitelist: if (uinfo.getheader(header) != ''): resp.headers[header] = uinfo.getheader(header) # also add CORS; see also our CORS middleware resp.headers['Access-Control-Allow-Origin'] = '*' return resp
def handle404(self, reqorig, url, container, obj):
    """
    Return a webob.Response which fetches the thumbnail from the thumb
    host and returns it. Note also that the thumb host might write it out
    to Swift so it won't 404 next time.
    """
    # go to the thumb media store for unknown files
    reqorig.host = self.thumbhost
    # upload doesn't like our User-agent, otherwise we could call it
    # using urllib2.url()
    proxy_handler = urllib2.ProxyHandler({'http': self.thumbhost})
    redirect_handler = DumbRedirectHandler()
    opener = urllib2.build_opener(redirect_handler, proxy_handler)
    # Thumbor doesn't need (and doesn't like) the proxy
    thumbor_opener = urllib2.build_opener(redirect_handler)

    # Pass on certain headers from the caller squid to the scalers
    opener.addheaders = []
    if reqorig.headers.get('User-Agent') is not None:
        opener.addheaders.append(('User-Agent', reqorig.headers.get('User-Agent')))
    else:
        opener.addheaders.append(('User-Agent', self.user_agent))
    for header_to_pass in ['X-Forwarded-For', 'X-Forwarded-Proto',
                           'Accept', 'Accept-Encoding', 'X-Original-URI']:
        if reqorig.headers.get(header_to_pass) is not None:
            opener.addheaders.append((header_to_pass, reqorig.headers.get(header_to_pass)))
    thumbor_opener.addheaders = opener.addheaders

    # At least in theory, we shouldn't be handing out links to originals
    # that we don't have (or in the case of thumbs, can't generate).
    # However, someone may have a formerly valid link to a file, so we
    # should do them the favor of giving them a 404.
    try:
        # break apart the url, url-encode it, and put it back together
        urlobj = list(urlparse.urlsplit(reqorig.url))
        # encode the URL but don't encode %s and /s
        urlobj[2] = urllib2.quote(urlobj[2], '%/')
        encodedurl = urlparse.urlunsplit(urlobj)

        # Thumbor never needs URL mangling and it needs a different host
        if self.thumborhost:
            thumbor_reqorig = reqorig.copy()
            thumbor_reqorig.host = self.thumborhost
            thumbor_urlobj = list(urlparse.urlsplit(thumbor_reqorig.url))
            thumbor_urlobj[2] = urllib2.quote(thumbor_urlobj[2], '%/')
            thumbor_encodedurl = urlparse.urlunsplit(thumbor_urlobj)

        # if sitelang, we're supposed to mangle the URL so that
        # http://upload.wm.o/wikipedia/commons/thumb/a/a2/Foo_.jpg/330px-Foo_.jpg
        # changes to
        # http://commons.wp.o/w/thumb_handler.php/a/a2/Foo_.jpg/330px-Foo_.jpg
        if self.backend_url_format == 'sitelang':
            match = re.match(
                r'^http://(?P<host>[^/]+)/(?P<proj>[^-/]+)/(?P<lang>[^/]+)/thumb/(?P<path>.+)',
                encodedurl)
            if match:
                proj = match.group('proj')
                lang = match.group('lang')
                # and here are all the legacy special cases, imported from thumb_handler.php
                if(proj == 'wikipedia'):
                    if(lang in ['meta', 'commons', 'internal', 'grants']):
                        proj = 'wikimedia'
                    if(lang in ['mediawiki']):
                        lang = 'www'
                        proj = 'mediawiki'
                hostname = '%s.%s.%s' % (lang, proj, self.tld)
                if(proj == 'wikipedia' and lang == 'sources'):
                    # yay special case
                    hostname = 'wikisource.%s' % self.tld
                # ok, replace the URL with just the part starting with thumb/
                # take off the first two parts of the path
                # (eg /wikipedia/commons/); make sure the string starts
                # with a /
                encodedurl = 'http://%s/w/thumb_handler.php/%s' % (
                    hostname, match.group('path'))
                # add in the X-Original-URI with the swift got (minus the hostname)
                opener.addheaders.append(
                    ('X-Original-URI', list(urlparse.urlsplit(reqorig.url))[2]))
            else:
                # ASSERT this code should never be hit since only thumbs
                # should call the 404 handler
                self.logger.warn("non-thumb in 404 handler! encodedurl = %s" % encodedurl)
                resp = webob.exc.HTTPNotFound('Unexpected error')
                return resp
        else:
            # log the result of the match here to test and make sure it's
            # sane before enabling the config
            match = re.match(
                r'^http://(?P<host>[^/]+)/(?P<proj>[^-/]+)/(?P<lang>[^/]+)/thumb/(?P<path>.+)',
                encodedurl)
            if match:
                proj = match.group('proj')
                lang = match.group('lang')
                self.logger.warn(
                    "sitelang match has proj %s lang %s encodedurl %s" % (
                        proj, lang, encodedurl))
            else:
                self.logger.warn("no sitelang match on encodedurl: %s" % encodedurl)

        # To turn thumbor off and have thumbnail traffic served by image scalers,
        # replace the line below with this one:
        # upcopy = opener.open(encodedurl)
        upcopy = thumbor_opener.open(thumbor_encodedurl)
    except urllib2.HTTPError, error:
        # copy the urllib2 HTTPError into a webob HTTPError class as-is
        class CopiedHTTPError(webob.exc.HTTPError):
            code = error.code
            title = error.msg

            def html_body(self, environ):
                return self.detail

            def __init__(self):
                super(CopiedHTTPError, self).__init__(
                    detail="".join(error.readlines()),
                    headers=error.hdrs.items())

        return CopiedHTTPError()
def base_request(self, method, container=None, name=None, prefix=None, headers=None, proxy=None, contents=None, full_listing=None, logger=None, additional_info=None): # Common request method trans_start = time() url = self.url if headers is None: headers = {} if self.token: headers['X-Auth-Token'] = self.token if container: url = '%s/%s' % (url.rstrip('/'), quote(container)) if name: url = '%s/%s' % (url.rstrip('/'), quote(name)) else: url += '?format=json' if prefix: url += '&prefix=%s' % prefix if proxy: proxy = urlparse.urlparse(proxy) proxy = urllib2.ProxyHandler({proxy.scheme: proxy.netloc}) opener = urllib2.build_opener(proxy) urllib2.install_opener(opener) req = urllib2.Request(url, headers=headers, data=contents) req.get_method = lambda: method conn = urllib2.urlopen(req) body = conn.read() try: body_data = json.loads(body) except ValueError: body_data = None trans_stop = time() if logger: sent_content_length = 0 for n, v in headers.items(): nl = n.lower() if nl == 'content-length': try: sent_content_length = int(v) break except ValueError: pass logger.debug("-> " + " ".join( quote(str(x) if x else "-", ":/") for x in ( strftime('%Y-%m-%dT%H:%M:%S', gmtime(trans_stop)), method, url, conn.getcode(), sent_content_length, conn.info()['content-length'], trans_start, trans_stop, trans_stop - trans_start, additional_info ))) return [None, body_data]
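# Hedged usage sketch for base_request above: a container GET with
# format=json returns the object listing as parsed JSON. `client` stands
# for an instance of the class this method belongs to, already holding
# .url and .token from authentication.
listing = client.base_request('GET', container='backups', prefix='2014/')[1]
for entry in listing or []:
    print entry.get('name')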
def requestPage(self,etag,set_cookie,rank,site,server): try: global num_errors print "Checking site: " + str(urlHeader+site) sys.stdout.flush() robot_url = urlHeader+site+"/robots.txt" rp = robotparser.RobotFileParser() rp.set_url(robot_url) rp.read() if rp.errcode == 404 or (rp.can_fetch("*", urlHeader+site+'/') and rp.errcode == 200): try: time.sleep(60) opener = urllib2.build_opener(openanything.DefaultErrorHandler()) req = urllib2.Request(urlHeader+site) res = opener.open(req) etag1 = res.info()['ETag'] set_cookie1 = res.info()['Set-Cookie'] print str(etag1) + " and " + str(set_cookie1) print "Trying case 1" + str(urlHeader+site) time.sleep(60) # 1. Use set-cookie given by initial connection req = urllib2.Request(urlHeader+site, headers={"Cookie" : set_cookie1}) res = opener.open(req) etag2 = res.info()['ETag'] if etag1 == etag2: print "ETags are same." self.same_etag.append((site,etag1,set_cookie1)) self.etag_unchanged += 1 else: print "ETags are different." self.etag_changed += 1 time.sleep(60) print "Trying case 2" + str(urlHeader+site) # 2. Use ETag given by initial connection req = urllib2.Request(urlHeader+site, headers={"If-None-Match" : etag1}) res = opener.open(req) if res.getcode() == 304: print "Returned 304 Not Modified" self.cookie_304 += 1 else: try: set_cookie2 = res.info()['Set-Cookie'] if set_cookie1 == set_cookie2: print "Set the same coookie!" self.same_set_cookie.append((site,etag1,set_cookie1)) self.cookie_unchanged += 1 else: print "Receieved new cookie." self.cookie_changed += 1 except: print "Does not have Set-Cookie header." self.error_site.append(site) num_errors += 1 except: print "Problem acquiring either ETag or Set-Cookie in response packet." self.error_site.append(site) num_errors += 1 else: print "No permissions given to robot." self.error_site.append(site) self.err_permission += 1 except: print "Error connecting to robot." self.error_site.append(site) self.err_robot += 1 return site