import re
import http.cookiejar
import urllib.error
import urllib.parse
import urllib.request


def download_url_list():
    # Fetch the JD.com front page with cookie support.
    cookie = http.cookiejar.CookieJar()
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    url = 'http://www.jd.com'
    response = opener.open(url)
    html = response.read().decode('utf-8', 'ignore')
    # Collect the markup between each category-menu anchor and its closing tag.
    start_pattern = 'class="cate_menu_lk"'
    end_pattern = '</a>'
    start_position = html.find(start_pattern)
    link_str = ''
    while start_position > 0:
        html = html[start_position + len(start_pattern):]
        end_position = html.find(end_pattern)
        if end_position > 0:
            link_str += html[:end_position]
        start_position = html.find(start_pattern)
    # Extract protocol-relative URLs ("//host/path") and prefix them with "http:".
    url_re = r'//[a-z./\d\-]+'
    url_list = []
    for url in re.findall(url_re, link_str):
        url_list.append('http:' + url)
    return url_list
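# Example usage (a minimal sketch, assuming jd.com is reachable and its
# category menu still uses class="cate_menu_lk" anchors):
if __name__ == '__main__':
    for u in download_url_list():
        print(u)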
def _register_agent(self):
    register_name = self.app.config.get('TRCDASH_REGISTER_AS')
    if not register_name:
        register_name = socket.gethostname()

    url_args = {
        'name': register_name,
        'port': self.app.config.get('TRCDASH_PORT', self.DEFAULT_PORT),
    }
    register_url = '%s/register?%s' % (
        self.app.config['TRCDASH_REGISTER_TO'],
        urllib.parse.urlencode(url_args))

    if 'TRCDASH_AUTH_USERNAME' in self.app.config and 'TRCDASH_AUTH_PASSWORD' in self.app.config:
        auth_handler = urllib.request.HTTPBasicAuthHandler()
        auth_handler.add_password(
            realm='TRCDash login required',
            uri=register_url,
            user=self.app.config['TRCDASH_AUTH_USERNAME'],
            passwd=self.app.config['TRCDASH_AUTH_PASSWORD'])
        opener = urllib.request.build_opener(auth_handler)
        urllib.request.install_opener(opener)

    try:
        urllib.request.urlopen(register_url)
    except urllib.error.HTTPError as e:
        logger.error('Failed to register agent to "%s": %s', register_url, e)
def crawl(parent_url):
    print('do crawl')
    cookie = http.cookiejar.CookieJar()
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open(parent_url)
    html = response.read().decode('gbk', 'ignore')
    html = html.replace('\\/', '/')
    new_url_list = []
    sku_information = []
    links = re.findall(r'href="(//[a-zA-Z0-9./\-]+)"', html)
    for link in links:
        # Keep jd.com links, skipping the club/item.m/help/yp subdomains
        # (the chained `and` conditions short-circuit left to right).
        if (link.find('jd.com') > 0
                and link.find('club.jd.com') < 0
                and link.find('item.m.jd.com') < 0
                and link.find('help.jd.com') < 0
                and link.find('yp.jd.com') < 0):
            new_url_list.append('http:' + link)
    if parent_url.find('item.jd.com') >= 0:
        # The first run of digits in an item URL is the SKU id.
        all_found = re.findall(r'\d+', parent_url)
        sku_information.append(all_found[0])
        for item in retrieve_sku_info(html):
            sku_information.append(item)
        print('sku', sku_information)
    return new_url_list, sku_information
def init(self):
    # HTTPNtlmAuthHandler comes from the python-ntlm package.
    self.AUTH_MGR = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    self.AUTH_MGR.add_password(None, "https://%s/" % (self.hostname),
                               r'%s\%s' % (self.domain, self.username),
                               self.password)
    self.AUTH = HTTPNtlmAuthHandler.HTTPNtlmAuthHandler(self.AUTH_MGR)
    self._handler = urllib.request.HTTPHandler(debuglevel=self.debug)
    self._opener = urllib.request.build_opener(self.AUTH)
    urllib.request.install_opener(self._opener)
def doHttpWithCookie(self, url, data=None, save_cookie=False):
    # MozillaCookieJar supports load()/save(); the plain CookieJar does not.
    cookie = http.cookiejar.MozillaCookieJar(self.cookie_filename)
    try:
        cookie.load(ignore_discard=True, ignore_expires=True)
    except (FileNotFoundError, http.cookiejar.LoadError):
        pass  # no saved cookies yet
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open(url)
    for item in cookie:
        print('Name = ' + item.name)
        print('Value = ' + item.value)
    if save_cookie:
        cookie.save(ignore_discard=True, ignore_expires=True)
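# Sketch of a call; `client` stands in for the owning object, which must
# define cookie_filename before this runs:
# client.cookie_filename = 'cookies.txt'
# client.doHttpWithCookie('http://example.com/', save_cookie=True)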
def content_test(url, badip):
    # True when the page loads with HTTP 200 and does not contain badip.
    try:
        request = urllib.request.Request(url)
        opened_request = urllib.request.build_opener().open(request)
        html_content = opened_request.read().decode('utf-8', 'ignore')
        if opened_request.code != 200:
            return False
        return len(re.findall(badip, html_content)) == 0
    except Exception:
        return False
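# Example usage (a minimal sketch; the URL and IP pattern are placeholders):
if __name__ == '__main__':
    print(content_test('http://example.com/', r'203\.0\.113\.7'))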
def register_openers():
    """Register the streaming HTTP handlers in the global urllib.request
    default opener object.

    Returns the created OpenerDirector object."""
    handlers = [StreamingHTTPHandler, StreamingHTTPRedirectHandler]
    # Only add the HTTPS handler when the stdlib was built with SSL support.
    if hasattr(http.client, "HTTPSConnection"):
        handlers.append(StreamingHTTPSHandler)

    opener = urllib.request.build_opener(*handlers)
    urllib.request.install_opener(opener)
    return opener
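# Typical use: call register_openers() once at startup so subsequent
# urllib.request.urlopen() calls go through the streaming handlers.
# (Sketch only; the Streaming* handler classes are defined elsewhere
# in this module.)
# opener = register_openers()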
def profile_edit_linkdin(request):
    # Scrape name, location, and skills from a public LinkedIn profile.
    urlopener = urllib.request.build_opener()
    urlopener.addheaders = [('User-agent', 'Mozilla/5.0')]
    sauce = urlopener.open('https://www.linkedin.com/in/aditya-mittal-709a1162/').read()
    soup = bs.BeautifulSoup(sauce, 'lxml')

    topcard = soup.find('section', attrs={'class': 'profile-section', 'id': 'topcard'})
    final_name = topcard.find('h1', attrs={'class': 'fn'}).text
    final_location = topcard.find('span', attrs={'class': 'locality'}).text

    skills_soup = soup.find('section', attrs={'class': 'profile-section', 'id': 'skills'}).find_all('li')
    skills = [li.text for li in skills_soup]
    final_skills = skills[:len(skills_soup) - 3]  # drop the trailing non-skill items

    print(final_name)
    print(final_location)
    print(final_skills)

    alumni = get_object_or_404(Alumni, user=request.user)
    if request.method == "POST":
        form = ProfileForm(request.POST, instance=alumni)
        if form.is_valid():
            alumni = form.save(commit=False)
            alumni.save()
            return redirect('home:profile')
    else:
        form = ProfileForm()
    return render(request, 'registration/edit_profile.html', {'form': form})
def deploy():
    opts, args = parse_opts()
    if not inside_project():
        _log("Error: no Scrapy project found in this location")
        sys.exit(1)

    _delete_old_package()
    urllib.request.install_opener(urllib.request.build_opener(HTTPRedirectHandler))

    if opts.list_targets:
        for name, target in _get_targets().items():
            print("%-20s %s" % (name, target['url']))
        return

    if opts.list_projects:
        target = _get_target(opts.list_projects)
        req = urllib.request.Request(_url(target, 'listprojects.json'))
        _add_auth_header(req, target)
        f = urllib.request.urlopen(req)
        projects = json.loads(f.read())['projects']
        print(os.linesep.join(projects))
        return

    tmpdir = None

    if opts.build_egg:  # build egg only
        egg, tmpdir = _build_egg()
        _log("Writing egg to %s" % opts.build_egg)
        shutil.copyfile(egg, opts.build_egg)
    elif opts.deploy_all_targets:
        version = None
        for name, target in _get_targets().items():
            if version is None:
                version = _get_version(target, opts)
            _build_egg_and_deploy_target(target, version, opts)
    else:  # build egg and deploy
        target_name = _get_target_name(args)
        target = _get_target(target_name)
        version = _get_version(target, opts)
        exitcode, tmpdir = _build_egg_and_deploy_target(target, version, opts)

    if tmpdir:
        if opts.debug:
            _log("Output dir not removed: %s" % tmpdir)
        else:
            shutil.rmtree(tmpdir)
    _delete_old_package()
def redeem_trusted_ticket(self, view_to_redeem, trusted_ticket, site='default'):
    trusted_view_url = "{}/trusted/{}".format(self.tableau_server_url, trusted_ticket)
    if site.lower() != 'default':
        trusted_view_url += "/t/{}/views/{}".format(site, view_to_redeem)
    else:
        trusted_view_url += "/views/{}".format(view_to_redeem)

    opener = urllib.request.build_opener(urllib.request.HTTPHandler)
    request = urllib.request.Request(trusted_view_url)
    try:
        response = opener.open(request)
    except urllib.error.HTTPError as e:
        if e.code >= 500:
            raise
        raw_error_response = e.read()
def get_trusted_ticket_for_user(self, username, site='default', ip=None):
    trusted_url = self.tableau_server_url + "/trusted"
    opener = urllib.request.build_opener(urllib.request.HTTPHandler)
    post_data = "username={}".format(username)
    if site.lower() != 'default':
        post_data += "&target_site={}".format(site)
    # Passing data makes this a POST; the body must be bytes in Python 3.
    request = urllib.request.Request(trusted_url, data=post_data.encode('utf-8'))
    try:
        trusted_ticket_response = opener.open(request)
        ticket = trusted_ticket_response.read().decode('utf-8')
        if ticket == '-1' or not ticket:
            raise NoResultsException('Ticket generation was not complete.')
        return ticket
    except urllib.error.HTTPError as e:
        if e.code >= 500:
            raise
        raw_error_response = e.read()
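# Sketch of the trusted-ticket round trip; `server` stands in for whatever
# object these two methods live on in this project:
# ticket = server.get_trusted_ticket_for_user('alice', site='marketing')
# server.redeem_trusted_ticket('Workbook/View', ticket, site='marketing')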
def Brower(url):
    login_page = "https://10.64.70.225/cgi-bin/logon.cgi"
    # POST body captured with Fiddler.
    login_data = ("usrname=admin&passwd=admin&isCookieEnable=1&action=on"
                  "&wrong_passwd=%3C%21--invalid_passwd_flag--%3E")
    try:
        cj = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        opener.addheaders = [
            ('User-agent',
             'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; '
             '.NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; '
             'MS-RTC LM 8; InfoPath.2; CIBA; .NET4.0C; .NET4.0E; .NET CLR 1.1.4322)')
        ]
        # POST data must be bytes in Python 3.
        opener.open(login_page, login_data.encode('ascii'))
        op = opener.open(url)
        return op.read()
    except Exception as e:
        print(str(e))
def linkWithPorxy(self, line):
    lineList = line.split('\t')
    protocol = lineList[2].lower()
    server = protocol + r'://' + lineList[0] + ':' + lineList[1]
    opener = urllib.request.build_opener(urllib.request.ProxyHandler({protocol: server}))
    urllib.request.install_opener(opener)
    try:
        response = urllib.request.urlopen(self.URL, timeout=self.timeout)
    except Exception:
        print('%s connect failed' % server)
        return
    try:
        # Avoid shadowing the str builtin; decode so the regex can search text.
        content = response.read().decode('utf-8', 'ignore')
    except Exception:
        print('%s connect failed' % server)
        return
    if self.regex.search(content):
        print('%s connect success .........' % server)
        self.aliveList.append(line)
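# linkWithPorxy() expects tab-separated proxy-list lines of the form
# "host<TAB>port<TAB>protocol". Sketch (owner object and proxy are placeholders):
# checker.linkWithPorxy('120.197.234.164\t80\tHTTP')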
def profile_edit_manual(request):
    # Fetch the profile page with a browser-like User-Agent and dump the soup.
    urlopener = urllib.request.build_opener()
    urlopener.addheaders = [('User-agent', 'Mozilla/5.0')]
    sauce = urlopener.open('https://www.linkedin.com/in/deepakgouda/').read()
    soup = bs.BeautifulSoup(sauce, 'lxml')
    print(soup)

    alumni = get_object_or_404(Alumni, user=request.user)
    if request.method == "POST":
        form = ProfileForm(request.POST, instance=alumni)
        if form.is_valid():
            alumni = form.save(commit=False)
            alumni.save()
            return redirect('home:profile')
    else:
        form = ProfileForm()
    return render(request, 'registration/edit_profile.html', {'form': form})
def buildhtmlheader(self):
    """Generate HTML header content."""
    if self.drilldown_flag:
        self.add_JSsource('http://code.highcharts.com/modules/drilldown.js')

    if self.offline:
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
        # Inline the assets; read() returns bytes, so decode before formatting.
        self.header_css = [
            '<style>%s</style>' % opener.open(h).read().decode('utf-8')
            for h in self.CSSsource
        ]
        self.header_js = [
            '<script type="text/javascript">%s</script>' % opener.open(h).read().decode('utf-8')
            for h in self.JSsource
        ]
    else:
        self.header_css = [
            '<link href="%s" rel="stylesheet" />' % h for h in self.CSSsource
        ]
        self.header_js = [
            '<script type="text/javascript" src="%s"></script>' % h
            for h in self.JSsource
        ]

    self.htmlheader = ''
    for css in self.header_css:
        self.htmlheader += css
    for js in self.header_js:
        self.htmlheader += js
# -*- coding: utf-8 -*-
import urllib.parse
import urllib.request
from http.cookiejar import CookieJar

username = '******'
password = ''

cj = CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
# POST data must be URL-encoded bytes.
login_data = urllib.parse.urlencode({'mb_id': username, 'mb_password': password}).encode('utf-8')
opener.open('http://www.jungol.co.kr/bbs/login.php', login_data)
resp = opener.open('http://www.jungol.co.kr/theme/jungol/contest.php?cid=404')
print(resp.read())
import http.cookiejar
import urllib.request

print('start')
cookie = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
response = opener.open('http://www.zhihu.com')
for item in cookie:
    print(item.name + ':' + item.value)
print('over')
def openDebug(self):
    # Log request/response traffic for both HTTP and HTTPS.
    httpHandler = urllib.request.HTTPHandler(debuglevel=1)
    httpsHandler = urllib.request.HTTPSHandler(debuglevel=1)
    opener = urllib.request.build_opener(httpHandler, httpsHandler)
    urllib.request.install_opener(opener)
def setProxy(self, proxy_info):
    # proxy_info is expected to be a mapping with 'host' and 'port' keys.
    proxy_handler = urllib.request.ProxyHandler(
        {"http": "http://%(host)s:%(port)d" % proxy_info})
    opener = urllib.request.build_opener(proxy_handler)
    urllib.request.install_opener(opener)
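# Example (a sketch; `client` stands in for the owning object, and the
# proxy host/port are placeholders):
# client.setProxy({'host': '127.0.0.1', 'port': 8080})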
def UrlRequest(str_symbol, start, end):
    # Download daily quote history as CSV from NetEase (163) finance, e.g.:
    # http://quotes.money.163.com/service/chddata.html?code=1000593&start=19960312&end=20150623&fields=TCLOSE;HIGH;LOW;TOPEN;LCLOSE;CHG;PCHG;TURNOVER;VOTURNOVER;VATURNOVER;TCAP;MCAP
    mainurl = "http://quotes.money.163.com/service/chddata.html?"
    options = "TCLOSE;HIGH;LOW;TOPEN;LCLOSE;CHG;PCHG;TURNOVER;VOTURNOVER;VATURNOVER;TCAP;MCAP"
    suburl = "code=%s&start=%d&end=%d&fields=%s" % (str_symbol, start, end, options)
    url = mainurl + suburl
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:52.0) Gecko/20100101 Firefox/52.0'
    }
    retry = 0
    MaxRetry = 3
    while True:
        try:
            # verify=False is used below, so silence the certificate warnings.
            requests.packages.urllib3.disable_warnings()
            r = requests.get(url, headers=headers, verify=False)
            r.encoding = 'UTF-8'
            return r.text
        except requests.RequestException as e:
            print('request failed: {0}'.format(e))
            retry += 1
            if retry > MaxRetry:
                print('More than max %d' % MaxRetry)
                raise
            print('Try request again ...')
import http.cookiejar
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup

# Set the proxy IP.
proxy_support = urllib.request.ProxyHandler({'http': '120.197.234.164:80'})
# Set up cookie handling.
cookie_support = urllib.request.HTTPCookieProcessor(http.cookiejar.LWPCookieJar())
opener = urllib.request.build_opener(proxy_support, cookie_support, urllib.request.HTTPHandler)
urllib.request.install_opener(opener)

# Starting URL.
#hosturl = "http://www.renren.com"
hosturl = "http://mail.163.com/"
# URL that receives the form data.
#posturl = "http://www.renren.com/ajaxLogin/login"
posturl = "https://mail.163.com/entry/cgi/ntesdoor?df=mail163_letter&from=web&funcid=loginone&iframe=1&language=-1&passtype=1&product=mail163&net=e&style=-1&race=118_35_39_bj&[email protected]"

# Form data to send (must be URL-encoded bytes for a POST).
postdata = urllib.parse.urlencode({
    "username": "******",
    "password": "******"
}).encode('utf-8')

# Request headers.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0",
    'Referer': 'http://mail.163.com/'
}

# Build the HTTP request (the original snippet was truncated at this call;
# it is completed here from the variables defined above).
req = urllib.request.Request(posturl, postdata, headers)
def handleCategoryMatch(data, http_object):
    logintype = data['login_type']
    creds1 = data['defaultCreds']
    origTarget = http_object.remote_system
    target = origTarget + data['defaultPath'][0]

    if logintype[0] == 'http_post':
        redirect = False
        try:
            req = urllib.request.Request(origTarget)
            opener = urllib.request.build_opener(SmartRedirectHandler())
            rsp = opener.open(req)
            code = rsp.getcode()
            if code == 301 or code == 302:
                target = rsp.geturl()
                redirect = True
        except urllib.error.URLError as e:
            raise Exception("There was an error: {}".format(e))
        htmlSource = parseURL(origTarget, '')
        inputs = getInputFields(htmlSource)
        if inputs[1] is not None:
            target = updateTarget(http_object.remote_system, inputs[1])
        inputs = inputs[0]

    for x in creds1:
        separated = re.split(r',\s*', x)
        for y in separated:
            creds = y.split(':')
            username = creds[0]
            password = creds[1]
            if logintype[0] == 'http_auth':
                check = httpAuth(target, username, password)
                if check:
                    http_object.default_creds = "Default creds are valid: {}".format(y)
                    http_object.category = "successfulLogin"
                    http_object._remote_login = target
                    break
            elif logintype[0] == 'http_post':
                if inputs is not None:
                    postData = getPostData(inputs, username, password)
                    if loginPost(origTarget, target, postData, data):
                        http_object.default_creds = "Default creds are valid: {}".format(y)
                        http_object.category = "successfulLogin"
                        http_object._remote_login = target
                        break
                    else:
                        http_object.category = "identifiedLogin"
            else:
                pass
    return http_object
import urllib.request

from bs4 import BeautifulSoup

opener = urllib.request.build_opener()
url = 'http://www.bbc.co.uk/news'
soup = BeautifulSoup(opener.open(url), "lxml")
titleTag = soup.html.head.title
print(titleTag.string)
titles = soup.find_all('span', {'class': 'title-link__title-text'})
headlines = [t.text for t in titles]
print(headlines)
def getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy):
    url = "https://twitter.com/i/search/timeline?f=tweets&q=%s&src=typd&max_position=%s"
    urlGetData = ''

    if hasattr(tweetCriteria, 'username'):
        urlGetData += ' from:' + tweetCriteria.username
    if hasattr(tweetCriteria, 'querySearch'):
        urlGetData += ' ' + tweetCriteria.querySearch
    if hasattr(tweetCriteria, 'near'):
        urlGetData += "&near:" + tweetCriteria.near + " within:" + tweetCriteria.within
    if hasattr(tweetCriteria, 'since'):
        urlGetData += ' since:' + tweetCriteria.since
    if hasattr(tweetCriteria, 'until'):
        urlGetData += ' until:' + tweetCriteria.until
    if hasattr(tweetCriteria, 'topTweets'):
        if tweetCriteria.topTweets:
            url = "https://twitter.com/i/search/timeline?q=%s&src=typd&max_position=%s"

    url = url % (urllib.parse.quote(urlGetData), urllib.parse.quote(refreshCursor))

    headers = [
        ('Host', "twitter.com"),
        ('User-Agent', "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"),
        ('Accept', "application/json, text/javascript, */*; q=0.01"),
        ('Accept-Language', "de,en-US;q=0.7,en;q=0.3"),
        ('X-Requested-With', "XMLHttpRequest"),
        ('Referer', url),
        ('Connection', "keep-alive")
    ]

    if proxy:
        opener = urllib.request.build_opener(
            urllib.request.ProxyHandler({'http': proxy, 'https': proxy}),
            urllib.request.HTTPCookieProcessor(cookieJar))
    else:
        opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(cookieJar))
    opener.addheaders = headers

    try:
        response = opener.open(url)
        jsonResponse = response.read()
    except Exception:
        print("Twitter weird response. Try to see on browser: https://twitter.com/search?q=%s&src=typd"
              % urllib.parse.quote(urlGetData))
        sys.exit()
        return

    dataJson = json.loads(jsonResponse)
    return dataJson
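# Illustrative call (a sketch; the criteria object is a stand-in carrying
# only the attributes the function probes with hasattr()):
# import http.cookiejar, types
# criteria = types.SimpleNamespace(querySearch='python', since='2018-01-01')
# page = getJsonReponse(criteria, '', http.cookiejar.CookieJar(), None)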