def __open(self, url, params=None):
    """Open *url*, optionally POSTing *params* as a urlencoded form body.

    :param url: target URL; must be a ``str``.
    :param params: optional ``dict`` of form fields. When truthy the
        request is sent as a POST with a urlencoded body; otherwise a
        plain GET is issued.
    :returns: the open response object when the HTTP status is 200,
        otherwise ``None``.
    :raises TypeError: if *url* is not a str, or *params* is neither a
        dict nor ``None``.
    :raises urllib.error.HTTPError: propagated from ``urlopen``.
    :raises urllib.error.URLError: propagated from ``urlopen``.
    """
    if not isinstance(url, str):
        raise TypeError(url)
    if params is not None and not isinstance(params, dict):
        raise TypeError(params)

    headers = {
        "User-Agent": "/".join(
            ["watcherlab", "feed", "client", "python", self.__version])
    }

    if params:
        params_bytes = bytes(urllib.parse.urlencode(params), encoding="utf-8")
        request = urllib.request.Request(url=url, headers=headers, data=params_bytes)
    else:
        request = urllib.request.Request(url=url, headers=headers)

    # FIX: the original caught HTTPError/URLError only to construct and
    # raise a brand-new instance of the same type, which discarded the
    # original traceback and exception chain. Letting the exception
    # propagate unchanged is equivalent for callers and keeps debug info.
    response = urllib.request.urlopen(request, timeout=self.__timeout)
    if response.getcode() == 200:
        return response
    return None
def spider(url):
    """Fetch *url*, retrying forever, and return the raw page bytes.

    The module-level ``sleep_time`` is used as an adaptive inter-request
    delay: it shrinks (down to 5) after a successful fetch and grows
    (up to 20) after an unexpected failure, so the crawler backs off
    when the site misbehaves.
    """
    global sleep_time
    request = urllib.request.Request(url)
    # The two headers below impersonate a mobile browser: the imooc app
    # serves videos without registration, so presenting a mobile UA lets
    # this program download them directly.
    request.add_header('user-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36')
    while True:
        try:
            time.sleep(sleep_time)
            response = urllib.request.build_opener().open(request)
            if response.getcode() == 200:
                html = response.read()
                response.close()
                if html is not None:
                    # Success: speed the crawl back up, but never below 5s.
                    if sleep_time > 5:
                        sleep_time -= 1
                    return html
                else:
                    continue
        except urllib.error.URLError as e:
            print(e.reason, ':', url)
        except socket.timeout:
            print("-----socket timout:", url)
        # FIX: was a bare ``except:``, which also swallowed
        # KeyboardInterrupt/SystemExit and made the loop unkillable.
        except Exception:
            # Unknown failure: back off, capped at 20 seconds.
            if sleep_time < 20:
                sleep_time += 1
            print('********************do not know why it is happened!*****************')
            print("************************ now sleep time is: %d *********************" % sleep_time)
def _request(self, url, token=None, params=None):
    """Send a JSON request to *url*, optionally authenticated by *token*.

    :param url: target URL; must be a ``str``.
    :param token: optional API token, sent as the ``token`` header.
    :param params: optional payload; when truthy it is JSON-encoded and
        the request becomes a POST, otherwise a plain GET is sent.
    :returns: the open response object when the HTTP status is 200,
        otherwise ``None``.
    :raises TypeError: if *url* is not a str.
    """
    if not isinstance(url, str):
        raise TypeError(url)

    headers = dict()
    headers["User-Agent"] = "/".join(
        ["watcherlab", "feed", "client", "python", self.__version])
    headers["Content-Type"] = "application/json"
    if token:
        headers["token"] = token

    if params:
        params_bytes = bytes(json.dumps(params), encoding="utf-8")
    else:
        params_bytes = None

    request = urllib.request.Request(url=url, headers=headers, data=params_bytes)
    # FIX: the original wrapped this call in
    # ``except Exception as error: raise error`` — a no-op catch-and-
    # re-raise that only obscured the control flow; removed.
    # SECURITY NOTE(review): _create_unverified_context() disables TLS
    # certificate verification (MITM risk). Kept for compatibility with
    # the existing endpoints; consider ssl.create_default_context().
    response = urllib.request.urlopen(
        request, timeout=self.__timeout, context=ssl._create_unverified_context())
    if response.getcode() == 200:
        return response
    return None
def download(self, url):
    """Fetch *url* and return the response body bytes.

    :param url: URL to fetch; ``None`` is tolerated.
    :returns: the body as ``bytes`` on HTTP 200, else ``None`` (also
        when *url* is ``None``).
    """
    if url is None:
        return None
    # FIX: the original never closed the response, leaking the socket —
    # on the non-200 path it returned without reading or closing at all.
    with request.urlopen(url) as response:
        if response.getcode() != 200:
            return None
        return response.read()
def request_until_succeed(url):
    """Keep requesting *url* until an HTTP 200 arrives, then return the body.

    Every failure is printed and retried after a 5-second pause, so this
    function only returns once the request finally succeeds (it will loop
    forever on a permanently broken URL).
    """
    req = urllib.request.Request(url)
    while True:
        try:
            response = urllib.request.urlopen(req)
            if response.getcode() == 200:
                break
        except Exception as e:
            print(e)
            time.sleep(5)
            print("Error for URL %s: %s" % (url, datetime.datetime.now()))
    return response.read().decode('utf8')
def web_crawl(self):
    """Download ``self.pageUrl`` into ``self.pageHtml``.

    When ``self.pageUrl`` is ``None``, ``self.pageHtml`` is set to the
    empty string and no request is made.
    """
    if self.pageUrl is None:
        self.pageHtml = ''
        # BUG FIX: the original fell through here and called
        # Request(None), which raises — the guard was pointless without
        # this early return.
        return
    request = urllib.request.Request(self.pageUrl)
    # The two headers below impersonate a mobile browser: the imooc app
    # serves videos without registration, so presenting a mobile UA lets
    # this program download them directly.
    request.add_header(
        'user-agent',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36'
    )
    response = urllib.request.urlopen(request)
    if response.getcode() == 200:
        self.pageHtml = response.read()
def check_response(url):
    """
    Check the response

    :param url: the full big_data_url URL
    :returns: 200/None (HTTP/FTP) if everything went well
              else it returns string containing the error code and message

    NOTE: check_url makes sure that the URL is 'valid' (it starts with http, ftp ...)
          check_response makes sure that the file exists
    """
    try:
        # FIX: ``with`` closes the connection on the success path — the
        # original returned without ever closing the open response.
        with urllib.request.urlopen(url) as response:
            return response.getcode()
    except urllib.error.HTTPError as exp:
        # Return code error (e.g. 404, 501, ...)
        logger.error('HTTPError: {}'.format(exp.code))
        return "{}: {}".format(exp.code, exp.reason)
    except urllib.error.URLError as exp:
        # Not an HTTP-specific error (e.g. connection refused, FTP errors)
        logger.error('URLError: {}'.format(exp.reason))
        return "{}".format(exp.reason)
import requests

# Demo script comparing HTTP clients: requests (POST), urllib (GET),
# then BeautifulSoup parsing of a fetched page.

# --- requests: POST form data to an echo service ---
payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.post('http://httpbin.org/post', payload)
print("#No.0001:")
print(ret.text)

# --- urllib: basic GET and response inspection ---
url = 'https://www.baidu.com/'
req = request.Request(url)
response = request.urlopen(req)
print("#No.1==>type of response:")
content = response.read()
# NOTE(review): read() above has already consumed the body, so the
# stream is exhausted here — readlines() returns an empty list; only
# info()/getcode()/geturl() still carry data. Confirm before relying on con1.
con1 = response.readlines()
con2 = response.info()
con3 = response.getcode()
con4 = response.geturl()
print(content)
print(con1, "\n", con2, "\n", con3, "\n", con4, "\n")

# --- BeautifulSoup: parse a fetched blog page ---
url2 = 'http://blog.csdn.net/ritterliu/article/details/70243112'
req2 = request.Request(url2)
response2 = request.urlopen(req2)
content2 = BeautifulSoup(response2.read(), "html5lib")
print("#No.2==>", content2.title)
print("#No.3==>", content2.find_all(name='h1'))
namelist = content2.find_all(name='img')
print("#No.4==>")
# NOTE(review): the body of this loop is truncated in this chunk —
# it continues outside the visible source.
for name in namelist:
# Fetch Weather Underground almanac/geolookup/history data for a list of
# ZIP codes and append the JSON records to an output file.
# NOTE(review): file1 and writeFile are never closed in this chunk —
# presumably handled (or leaked) further down; verify.
file1 = open("Zipcodes.txt", "r")
# ZIP codes are stored comma-separated in a single line/file.
zipList = file1.read().split(',')
print("total zipcodes: ", len(zipList))
apiKey = 'xxxxxxxxxxxxxxxxx'  # Weather Underground API key (redacted)
writeFile = open("WeartherData_Final", "a", encoding='utf-8')
j = 0
for i in range(len(zipList)):
    try:
        print(i)  # progress indicator: current ZIP index
        #------------------20180318------------------
        url = 'http://api.wunderground.com/api/'+str(apiKey)+'/almanac/geolookup/history_20180318/q/' \
            + str(zipList[i]).strip() + '/test.json'
        #request = urllib.request.Request(url)
        response = urllib.request.urlopen(url)
        if(response.getcode() == 200):
            data = json.load(response)
            # Only keep records that contain all three top-level sections.
            if ('location' in data and 'history' in data and 'almanac' in data):
                # Sanity-check the history payload before writing; the
                # magic 16 presumably means "at least hourly observations"
                # — TODO confirm against the API response shape.
                if ('date' in data['history'] and len(data['history']['dailysummary']) > 0
                        and len(data['history']['observations']) > 16):
                    writeFile.write(json.dumps(data, ensure_ascii=False))
                    writeFile.write(",")
                else:
                    # Incomplete record: remember where we stopped so the
                    # run can be resumed manually.
                    print("stopped at i = ", i, zipList[i])
                    break
        #------------------20180319------------------
        url = 'http://api.wunderground.com/api/'+str(apiKey)+'/almanac/geolookup/history_20180319/q/' \
            + str(zipList[i]).strip() + '/test.json'
        response = urllib.request.urlopen(url)
        if (response.getcode() == 200):
            data = json.load(response)
            # NOTE(review): the chunk is truncated here — the rest of this
            # try block and its except clause lie outside the visible source.
import urllib.request, urllib.response, http.cookiejar, cookiecutter
from bs4 import BeautifulSoup

# Demo of fetching a page with urllib and parsing it with BeautifulSoup.
url = 'https://www.cnblogs.com/zdlfb/p/6130724.html'

print('第一种方法')
response = urllib.request.urlopen(url)
print(response.getcode())
# BUG FIX: the response body can only be read once. The original called
# response.read() twice, so the second print always showed b''. Read the
# body into a variable and reuse it.
html = response.read()
print(len(html))
print(html)

# print('第二张方法')
# request=urllib.request.request(url)
# request.add_header('user-agent','mozilla/5.0')  # disguise the crawler as a browser
# response1=urllib.request.urlopen(request)
# print(response1.getcode())
# print(len(response1.read()))

# print('第三种方法')
# cj=http.cookiejar
# opener=urllib.build_opener(urllib.HttpCookieProcessor(cj))
# urllib.install_opener(opener)
# response3=urllib.request.urlopen(url)
# print(response3.getcode())
# print(response3.read())

# NOTE(review): BeautifulSoup() is built empty here; its arguments would
# be: html document string, html parser, html document encoding.
soup = BeautifulSoup()