Example #1
    def __open(self, url, params=None):
        if not isinstance(url, str):
            raise TypeError(url)

        if (not isinstance(params, dict)) and (params is not None):
            raise TypeError(params)

        headers = {
            "User-Agent":
            "/".join(
                ["watcherlab", "feed", "client", "python", self.__version])
        }

        try:
            # POST the URL-encoded parameters when provided, otherwise issue a plain GET
            if params:
                params_bytes = bytes(urllib.parse.urlencode(params),
                                     encoding="utf-8")
                request = urllib.request.Request(url=url,
                                                 headers=headers,
                                                 data=params_bytes)
            else:
                request = urllib.request.Request(url=url, headers=headers)

            response = urllib.request.urlopen(request, timeout=self.__timeout)

            if response.getcode() == 200:
                return response
            else:
                return None

        except urllib.error.HTTPError:
            # re-raise HTTP status errors to the caller unchanged
            raise

        except urllib.error.URLError:
            # re-raise connection-level errors to the caller unchanged
            raise
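The method above hands back the raw response object only when getcode() reports 200, leaving reading and decoding to the caller. A minimal standalone sketch of the same pattern, assuming an illustrative fetch() helper and User-Agent string that are not part of the original class:

import urllib.request

def fetch(url, timeout=10):
    # Hypothetical helper mirroring __open: return the response only on HTTP 200.
    headers = {"User-Agent": "watcherlab/feed/client/python/example"}
    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request, timeout=timeout)
    return response if response.getcode() == 200 else None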
Example #2
def spider(url):
    # relies on a module-level sleep_time variable that is tuned between retries
    global sleep_time
    request = urllib.request.Request(url)
    # The header below simulates a mobile browser: the imooc (慕课网) app can access the videos
    # without registering, so making our program look like a mobile browser lets it download them directly.
    request.add_header('user-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36')
    while True:
        try:
            time.sleep(sleep_time)
            response = urllib.request.build_opener().open(request)
            if response.getcode() == 200:
                html = response.read()
                response.close()
                if html is not None:
                    if sleep_time > 5:
                        sleep_time -= 1
                    return html
                else:
                    continue
        except urllib.error.URLError as e:
            print(e.reason, ':', url)
        except socket.timeout as e:
            print("-----socket timout:", url)
        except:
            if sleep_time < 20:
                sleep_time += 1
            print('********************do not know why it is happened!*****************')
            print("************************ now sleep time is: %d *********************" % sleep_time )
Example #3
    def _request(self, url, token=None, params=None):
        headers = dict()

        if not isinstance(url, str):
            raise TypeError(url)

        headers["User-Agent"] = "/".join(
            ["watcherlab", "feed", "client", "python", self.__version])
        headers["Content-Type"] = "application/json"

        if token:
            headers["token"] = token

        try:
            if params:
                params_bytes = bytes(json.dumps(params), encoding="utf-8")
            else:
                params_bytes = None

            request = urllib.request.Request(url=url,
                                             headers=headers,
                                             data=params_bytes)
            response = urllib.request.urlopen(
                request,
                timeout=self.__timeout,
                # note: _create_unverified_context() disables TLS certificate verification
                context=ssl._create_unverified_context())

            if response.getcode() == 200:
                return response
            else:
                return None

        except Exception:
            # propagate any network, encoding, or HTTP error to the caller unchanged
            raise
Example #4
    def download(self, url):
        # return the raw page body, or None when the URL is missing or the status is not 200
        if url is None:
            return None

        response = request.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()
Example #5
def request_until_succeed(url):
    req = urllib.request.Request(url)
    success = False
    while not success:
        try:
            response = urllib.request.urlopen(req)
            if response.getcode() == 200:
                success = True
        except Exception as e:
            print(e)
            time.sleep(5)
            print("Error for URL %s: %s" % (url, datetime.datetime.now()))

    return response.read().decode('utf8')
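request_until_succeed() blocks and retries every 5 seconds until it finally gets a 200, so it is typically wrapped around calls that must not fail silently, such as JSON APIs. A short usage sketch with an illustrative endpoint:

import json

raw = request_until_succeed("https://httpbin.org/get")  # blocks until the request succeeds
data = json.loads(raw)
print(data["url"])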
Example #6
    def web_crawl(self):
        '''
        Fetch self.pageUrl and store the HTML body in self.pageHtml.
        :return: None
        '''
        if self.pageUrl is None:
            self.pageHtml = ''
            return
        request = urllib.request.Request(self.pageUrl)
        # The following header simulates a mobile browser: the imooc (慕课网) app can access the videos
        # without registering, so a browser-like User-Agent lets the program download them directly.
        request.add_header(
            'user-agent',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36'
        )
        response = urllib.request.urlopen(request)
        if response.getcode() == 200:
            self.pageHtml = response.read()
Example #7
def check_response(url):
    """
    Check the response
    :param url: the full big_data_url URL
    :returns: 200/None (HTTP/FTP) if everything went well else it returns
    string containing the error code and message
    NOTE:
    check_url makes sure that the URL is 'valid' (it starts with http, ftp ...)
    check_response makes sure that the file exists
    """
    try:
        response = urllib.request.urlopen(url)
        return response.getcode()
    except urllib.error.HTTPError as exp:
        # Return code error (e.g. 404, 501, ...)
        logger.error('HTTPError: {}'.format(exp.code))
        return "{}: {}".format(exp.code, exp.reason)
    except urllib.error.URLError as exp:
        # Not an HTTP-specific error (e.g. connection refused, FTP errors)
        logger.error('URLError: {}'.format(exp.reason))
        return "{}".format(exp.reason)
Example #8
import requests
from urllib import request
from bs4 import BeautifulSoup

payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.post('http://httpbin.org/post', payload)
print("#No.0001:")
print(ret.text)

url = 'https://www.baidu.com/'

req = request.Request(url)
response = request.urlopen(req)
print("#No.1==>type of response:")
content = response.read()
con1 = response.readlines()   # empty: the body was already consumed by read() above
con2 = response.info()        # response headers
con3 = response.getcode()     # HTTP status code
con4 = response.geturl()      # final URL after any redirects
print(content)
print(con1, "\n", con2, "\n", con3, "\n", con4, "\n")

url2 = 'http://blog.csdn.net/ritterliu/article/details/70243112'
req2 = request.Request(url2)
response2 = request.urlopen(req2)
content2 = BeautifulSoup(response2.read(), "html5lib")

print("#No.2==>", content2.title)
print("#No.3==>", content2.find_all(name='h1'))

namelist = content2.find_all(name='img')
print("#No.4==>")
for name in namelist:
    print(name)
Example #9
file1 = open("Zipcodes.txt", "r")
zipList = file1.read().split(',')
print("total zipcodes: ",len(zipList))
apiKey = 'xxxxxxxxxxxxxxxxx'

writeFile = open("WeartherData_Final", "a", encoding='utf-8')
j = 0
for i in range(len(zipList)):
  try:
    print(i)
    #------------------20180318------------------
    url = 'http://api.wunderground.com/api/'+str(apiKey)+'/almanac/geolookup/history_20180318/q/' \
          + str(zipList[i]).strip() + '/test.json'
    #request = urllib.request.Request(url)
    response = urllib.request.urlopen(url)
    if(response.getcode() == 200):
      data = json.load(response)
      if ('location' in data and 'history' in data and 'almanac' in data):
        if ('date' in data['history'] and len(data['history']['dailysummary']) > 0 and len(data['history']['observations']) > 16):
          writeFile.write(json.dumps(data, ensure_ascii=False))
          writeFile.write(",")
    else:
      print("stopped at i = ",i,zipList[i])
      break

    #------------------20180319------------------
    url = 'http://api.wunderground.com/api/'+str(apiKey)+'/almanac/geolookup/history_20180319/q/' \
          + str(zipList[i]).strip() + '/test.json'
    response = urllib.request.urlopen(url)
    if (response.getcode() == 200):
      data = json.load(response)
Example #10
import urllib.request, http.cookiejar
from bs4 import BeautifulSoup
url = 'https://www.cnblogs.com/zdlfb/p/6130724.html'
print('Method 1')
response = urllib.request.urlopen(url)
print(response.getcode())
html = response.read()        # read the body once; a second read() would return b''
print(len(html))
print(html)
# print('Method 2')
# request = urllib.request.Request(url)
# request.add_header('user-agent', 'mozilla/5.0')  # disguise the crawler as a browser
# response1 = urllib.request.urlopen(request)
# print(response1.getcode())
# print(len(response1.read()))
# print('Method 3')
# cj = http.cookiejar.CookieJar()
# opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
# urllib.request.install_opener(opener)
# response3 = urllib.request.urlopen(url)
# print(response3.getcode())
# print(response3.read())
soup = BeautifulSoup(html, "html.parser")  # html document string, html parser, html document encoding
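The commented-out "Method 3" builds an opener with a cookie processor before checking getcode(); a hedged, runnable version of that idea, reusing the url variable defined above, might look like this:

import http.cookiejar
import urllib.request

cj = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
urllib.request.install_opener(opener)          # all later urlopen() calls now carry cookies
response3 = urllib.request.urlopen(url)
print(response3.getcode())
print(len(response3.read()))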