def base_request(self, method, container=None, name=None, prefix=None,
                 headers=None, proxy=None, contents=None, full_listing=None):
    # Common request method
    url = self.url
    # Avoid a mutable default argument for headers
    if headers is None:
        headers = {}
    if self.token:
        headers['X-Auth-Token'] = self.token
    if container:
        url = '%s/%s' % (url.rstrip('/'), quote(container))
    if name:
        url = '%s/%s' % (url.rstrip('/'), quote(name))
    url += '?format=json'
    if prefix:
        url += '&prefix=%s' % prefix
    if proxy:
        proxy = urlparse.urlparse(proxy)
        proxy = urllib2.ProxyHandler({proxy.scheme: proxy.netloc})
        opener = urllib2.build_opener(proxy)
        urllib2.install_opener(opener)
    req = urllib2.Request(url, headers=headers, data=contents)
    req.get_method = lambda: method
    # Open the request once (the original issued a duplicate urlopen call)
    conn = urllib2.urlopen(req)
    body = conn.read()
    try:
        body_data = json.loads(body)
    except ValueError:
        body_data = None
    return [None, body_data]
def main(self, start_url,
         block_extensions=['.pdf', '.gif', '.jpg', '.JPG', '.PNG', '.png',
                           '.wav', '.mp3', '.wma'],
         max_urls=100):
    # Set user agent string
    opener = urllib2.build_opener()
    opener.addheaders = [
        ('User-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.220 Safari/535.1'),
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
        ('Accept-Charset', 'utf-8,gbk;q=0.7,*;q=0.3'),
        #('Accept-Encoding', 'gzip,deflate,sdch'),
        ('Accept-Language', 'en-US,en,en-zh;q=0.8'),
        #('Cache-Control', 'max-age=0'),
        #('Connection', 'keep-alive')
    ]
    urllib2.install_opener(opener)

    # Get base info
    (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(start_url)
    fragments = (scheme, netloc, '', '', '', '')
    base_url = urlparse.urlunparse(fragments)
    #print "base_url -> ", base_url

    mainLink = LinkInfo(None, base_url, u'Main', 0, u'first page')
    self.assignID(mainLink)

    urls_queue = set([mainLink])
    urls_crawled = set()
    urls_crawled2 = set()

    pool = eventlet.GreenPool(20)
    counter = 0
    tmpC = 0
    while True:
        # Infinite loop sanity check
        counter += 1
        if counter > max_urls:
            break

        for url, body in pool.imap(self.fetch, urls_queue):
            # Remove this url from the queue set
            urls_queue = urls_queue - set([url])
            # Add url to crawled set
            urls_crawled = urls_crawled.union(set([url]))
            urls_crawled2 = urls_crawled2.union(set([url]))
            # Extract links
            links = self.extract_links(url, body, block_extensions)
            if links is None:
                return urls_crawled
            if tmpC == 100000:
                return urls_crawled
            tmpC += 1
            for link in links:
                if link not in urls_queue and link not in urls_crawled:
                    # Add link to queue
                    urls_queue = urls_queue.union(set([link]))
                    print u"[valid]: link -> ", link.link

    return urls_crawled
def main(self, start_url, block_extensions=['.pdf'], max_urls=100):
    # Set user agent string
    opener = urllib2.build_opener()
    opener.addheaders = [
        ('User-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.220 Safari/535.1'),
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
        ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'),
        ('Accept-Encoding', 'gzip,deflate,sdch'),
        ('Accept-Language', 'en-US,en;q=0.8'),
        ('Cache-Control', 'max-age=0'),
        ('Connection', 'keep-alive')
    ]
    urllib2.install_opener(opener)

    # Get base info
    (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(start_url)
    fragments = (scheme, netloc, '', '', '', '')
    base_url = urlparse.urlunparse(fragments)

    urls_queue = set([base_url])
    urls_crawled = set()

    pool = eventlet.GreenPool(20)
    counter = 0
    while True:
        # Infinite loop sanity check
        counter += 1
        if counter > max_urls:
            break

        for url, body in pool.imap(self.fetch, urls_queue):
            # Remove this url from the queue set
            urls_queue = urls_queue - set([url])
            # Add url to crawled set
            urls_crawled = urls_crawled.union(set([url]))
            # Extract links
            links = self.extract_links(url, body, block_extensions)
            for link in links:
                if link not in urls_queue and link not in urls_crawled:
                    # Add link to queue
                    urls_queue = urls_queue.union(set([link]))

    return urls_crawled
def proxyTest(self, row):
    proxy = row[0] + ":" + row[1]
    if 'HTTPS' in row[3]:
        proxies = {"https": "https://" + proxy}
    else:
        proxies = {"http": "http://" + proxy}
    ip = row[0]
    port = row[1]
    theProxy = urllib2.ProxyHandler(proxies)
    opener = urllib2.build_opener(theProxy)
    urllib2.install_opener(opener)
    testResult = 'ok!'
    try:
        webcode = urllib2.urlopen("https://www.fliggy.com/", timeout=10).getcode()
        #logger.info("Proxy %s is ok" % proxy)
    except Exception, e:
        #logger.warn("Proxy %s is nolonger ok" % proxy)
        # Drop the proxy from the pool if the test request fails
        self.clean(ip=ip, port=port)
        testResult = 'nolonger ok!'
def base_request(self, method, container=None, name=None, prefix=None,
                 headers=None, proxy=None, contents=None, full_listing=None,
                 logger=None, additional_info=None):
    # Common request method
    trans_start = time()
    url = self.url
    if headers is None:
        headers = {}
    if self.token:
        headers['X-Auth-Token'] = self.token
    if container:
        url = '%s/%s' % (url.rstrip('/'), quote(container))
    if name:
        url = '%s/%s' % (url.rstrip('/'), quote(name))
    else:
        url += '?format=json'
        if prefix:
            url += '&prefix=%s' % prefix
    if proxy:
        proxy = urlparse.urlparse(proxy)
        proxy = urllib2.ProxyHandler({proxy.scheme: proxy.netloc})
        opener = urllib2.build_opener(proxy)
        urllib2.install_opener(opener)
    req = urllib2.Request(url, headers=headers, data=contents)
    req.get_method = lambda: method
    conn = urllib2.urlopen(req)
    body = conn.read()
    try:
        body_data = json.loads(body)
    except ValueError:
        body_data = None
    trans_stop = time()
    if logger:
        sent_content_length = 0
        for n, v in headers.items():
            nl = n.lower()
            if nl == 'content-length':
                try:
                    sent_content_length = int(v)
                    break
                except ValueError:
                    pass
        logger.debug("-> " + " ".join(
            quote(str(x) if x else "-", ":/")
            for x in (
                strftime('%Y-%m-%dT%H:%M:%S', gmtime(trans_stop)),
                method,
                url,
                conn.getcode(),
                sent_content_length,
                conn.info()['content-length'],
                trans_start,
                trans_stop,
                trans_stop - trans_start,
                additional_info)))
    return [None, body_data]
def base_request(self, method, container=None, name=None, prefix=None,
                 headers=None, proxy=None, contents=None, full_listing=None,
                 logger=None, additional_info=None):
    # Common request method
    trans_start = time()
    url = self.url
    if headers is None:
        headers = {}
    if self.token:
        headers['X-Auth-Token'] = self.token
    if container:
        url = '%s/%s' % (url.rstrip('/'), quote(container))
    if name:
        url = '%s/%s' % (url.rstrip('/'), quote(name))
    else:
        url += '?format=json'
        if prefix:
            url += '&prefix=%s' % prefix
    if proxy:
        proxy = urlparse.urlparse(proxy)
        proxy = urllib2.ProxyHandler({proxy.scheme: proxy.netloc})
        opener = urllib2.build_opener(proxy)
        urllib2.install_opener(opener)
    req = urllib2.Request(url, headers=headers, data=contents)
    req.get_method = lambda: method
    # Open the request once (the original issued a duplicate urlopen call)
    conn = urllib2.urlopen(req)
    body = conn.read()
    try:
        body_data = json.loads(body)
    except ValueError:
        body_data = None
    trans_stop = time()
    if logger:
        sent_content_length = 0
        for n, v in headers.items():
            nl = n.lower()
            if nl == 'content-length':
                try:
                    sent_content_length = int(v)
                    break
                except ValueError:
                    pass
        logger.debug("-> " + " ".join(
            quote(str(x) if x else "-", ":/")
            for x in (
                strftime('%Y-%m-%dT%H:%M:%S', gmtime(trans_stop)),
                method,
                url,
                conn.getcode(),
                sent_content_length,
                conn.info()['content-length'],
                trans_start,
                trans_stop,
                trans_stop - trans_start,
                additional_info)))
    return [None, body_data]
def password_protected_page_downloader(dbConn, log):
    """
    *get a page that is behind HTTPS authentication password protection*

    **Key Arguments:**
        - ``dbConn`` -- mysql database connection
        - ``log`` -- logger
        - ``___`` --

    **Return:**
        - None
    """
    ################ > IMPORTS ################
    ## STANDARD LIB ##
    ## THIRD PARTY ##
    ## LOCAL APPLICATION ##
    import commands
    import urllib2

    theurl = 'https://groups.google.com/a/pessto.org/group/alerts/manage_members/alerts.csv'
    username = '******'
    password = '******'  # a great password

    # this creates a password manager
    passman = urllib2.HTTPPasswordMgrWithDefaultRealm()
    # because we have put None at the start it will always use this
    # username/password combination for urls for which `theurl` is a super-url
    passman.add_password(None, theurl, username, password)
    # create the AuthHandler
    authhandler = urllib2.HTTPBasicAuthHandler(passman)
    opener = urllib2.build_opener(authhandler)
    # All calls to urllib2.urlopen will now use our handler.
    # Make sure not to include the protocol in with the URL, or
    # HTTPPasswordMgrWithDefaultRealm will be very confused.
    # You must (of course) use it when fetching the page though.
    urllib2.install_opener(opener)
    # authentication is now handled automatically for us
    pagehandle = urllib2.urlopen(theurl)

    ################ > VARIABLE SETTINGS ######
    # command = 'wget --output-document=- --quiet --http-user=david.young --http-password=spac3d0ct0r https://groups.google.com/a/pessto.org/group/alerts/manage_members/alerts.csv'
    # status, text = commands.getstatusoutput(command)

    # url = "https://david.young:[email protected]/a/pessto.org/group/alerts/manage_members/alerts.csv"
    # try:
    #     urllib2.urlopen(urllib2.Request(url))
    # except urllib2.HTTPError, e:
    #     print e.headers
    #     print e.headers.has_key('WWW-Authenticate')

    ################ >ACTION(S) ################
    # log.debug('status %s' % (status,))
    # log.debug('text %s' % (text,))

    return