Example #1
import gzip
import io
import urllib.request

def get_aid_page(aid_url):
    response = None  # so the finally block can test it even on early failure
    try:
        req = urllib.request.Request(aid_url)

        # Set the request headers
        req.add_header("Host", "www.bilibili.com")
        req.add_header("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0")
        req.add_header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
        req.add_header("Accept-Language", "zh-CN,en-US;q=0.7,en;q=0.3")
        req.add_header("Connection", "keep-alive")
        req.add_header("Accept-Encoding", "gzip, deflate")

        response = urllib.request.urlopen(req, timeout=10)
        # response.info() returns the response headers
        if response.info()["Content-Encoding"] == "gzip":
            buf = io.BytesIO(response.read())
            f = gzip.GzipFile(fileobj=buf)
            aid_page_content = f.read().decode("utf-8")
        else:
            aid_page_content = response.read().decode("utf-8")

        #print(aid_page_content)
        return aid_page_content
    finally:
        # close exactly once, whether or not an error occurred
        if response:
            response.close()
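The per-header add_header() calls can be collapsed by passing a headers dict straight to the Request constructor; a minimal sketch with a hypothetical fetch() helper (gzip negotiation omitted so the body needs no decompression):

import urllib.request

def fetch(url):
    headers = {
        "Host": "www.bilibili.com",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0",
        "Accept-Language": "zh-CN,en-US;q=0.7,en;q=0.3",
    }
    req = urllib.request.Request(url, headers=headers)
    # the with-block closes the response, replacing the manual try/finally
    with urllib.request.urlopen(req, timeout=10) as response:
        return response.read().decode("utf-8")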
Example #3

    def proxyHttpBrowser(self, url):
        print(url)
        enable_proxy = True
        proxy_handler = urllib.request.ProxyHandler({"http": "http://some-proxy.com:8080"})
        cookie_handler = urllib.request.HTTPCookieProcessor(self.cookie)
        if enable_proxy:
            # keep the cookie processor alongside the proxy so self.cookie stays populated
            opener = urllib.request.build_opener(proxy_handler, cookie_handler)
        else:
            opener = urllib.request.build_opener(cookie_handler)
        # Install a global opener; urllib.request.urlopen will use it from here on
        urllib.request.install_opener(opener)
        req = urllib.request.Request(url)
        try:
            response = urllib.request.urlopen(req)
            # response.info() describes the fetched page: the headers the server sent
            print(response.info())
            html = response.read()
        except urllib.error.HTTPError as e:
            print('HTTPCODE:', e.code)
            html = None
        except urllib.error.URLError as e:
            print(e.reason)
            html = None
        self.cookie.save(ignore_discard=True, ignore_expires=True)
        return html
Example #4
    def _GetAuthCookie(self, auth_token):
        """Fetches authentication cookies for an authentication token.

    Args:
      auth_token: The authentication token returned by ClientLogin.

    Raises:
      HTTPError: If there was an error fetching the authentication cookies.
    """

        continue_location = "http://localhost/"
        args = {"continue": continue_location, "auth": auth_token}
        login_path = os.environ.get("APPCFG_LOGIN_PATH", "/_ah")
        req = self._CreateRequest(
            "%s://%s%s/login?%s" %
            (self.scheme, self.host, login_path, urllib.parse.urlencode(args)))
        try:
            response = self.opener.open(req)
        except urllib.error.HTTPError as e:
            response = e
        if (response.code != 302
                or response.info()["location"] != continue_location):
            raise urllib.error.HTTPError(req.get_full_url(), response.code,
                                         response.msg, response.headers,
                                         response.fp)
        self.authenticated = True
Example #5
    def annotateResponse(self, response):
        info = response.info()
        try:
            mimeType, sep, mimeInfo = info["Content-Type"].partition(";")
            m = self.charsetRE.search(mimeInfo)
            if m is not None:
                encoding = m.group(1)
            else:
                encoding = None
            mimeType = mimeType.strip()
        except (AttributeError, KeyError):
            # no Content-Type header, or one we cannot parse
            mimeType = "unknown/unknown"
            encoding = None

        try:
            response.handler = self.mimeMap[mimeType]
        except KeyError:
            print("fallback")
            # no exact match; try glob patterns such as "text/*"
            for glob, handler in self.globMap:
                if fnmatch(mimeType, glob):
                    response.handler = handler
                    break
            else:
                raise URLLookupError("No handler for MIME type: {0}".format(mimeType))

        response.mimeType = mimeType
        response.encoding = encoding
        self.bufferResponse(response, info)
        response.url = urllib.parse.urlparse(response.geturl())
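charsetRE, mimeMap, and globMap are attributes set up elsewhere in this class; a plausible sketch of their shape (the handler names here are hypothetical):

import re
from fnmatch import fnmatch

class URLLookupError(Exception):
    pass

charsetRE = re.compile(r"charset=([A-Za-z0-9_-]+)")   # pulls the charset out of Content-Type
mimeMap = {"text/html": "handle_html"}                # exact MIME type -> handler
globMap = [("text/*", "handle_text"),                 # glob pattern -> handler, tried in order
           ("image/*", "handle_image")]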
Example #6
import json
import urllib.request
from urllib.error import HTTPError, URLError

def add_dummy_subscription(data):

    rest_handler()

    request_url = top_level_url + "api/subscription"
    req = urllib.request.Request(request_url)

    req.add_header('Content-Type', 'application/json; charset=utf-8')
    jsondata = json.dumps(data)
    jsondataasbytes = jsondata.encode('utf-8')  # needs to be bytes
    req.add_header('Content-Length', len(jsondataasbytes))

    try:
        response = urllib.request.urlopen(req, jsondataasbytes)
        answer = json.loads(response.read().decode(
            response.info().get_param('charset') or 'utf-8'))

    except HTTPError as e:
        print('add_dummy_subscription HTTPError code: ', e.code)
        return None
    except URLError as e:
        print('add_dummy_subscription URLError Reason: ', e.reason)
        return None
    else:
        print(answer['data'][0])
        return answer['data'][0]
Example #7
import json
import urllib.request
from urllib.error import HTTPError, URLError

def post_register_user(userdata, url, callback):

    rest_handler_with_username("", "")

    request_url = top_level_url + "api/userregister"
    req = urllib.request.Request(request_url)

    req.add_header('Content-Type', 'application/json; charset=utf-8')
    jsondata = json.dumps(userdata)
    jsondataasbytes = jsondata.encode('utf-8')  # needs to be bytes
    req.add_header('Content-Length', len(jsondataasbytes))

    try:
        response = urllib.request.urlopen(req, jsondataasbytes)
        answer = json.loads(response.read().decode(
            response.info().get_param('charset') or 'utf-8'))

    except HTTPError as e:
        print('post_register_user HTTPError code: ', e.code)
        return False
    except URLError as e:
        print('post_register_user URLError Reason: ', e.reason)
        return False
    else:
        print(answer['data'])
        idaccounts = answer['data'][0]['idaccounts']
        post_register_user_image(idaccounts, userdata, url, callback)
        return True
Example #8
import gzip
import io
import urllib.request

def fetch_page(page):
    response = urllib.request.urlopen(page)
    if response.info().get('Content-Encoding') == 'gzip':
        # read() returns bytes, so the buffer must be BytesIO, not StringIO
        response_buffer = io.BytesIO(response.read())
        unzipped_content = gzip.GzipFile(fileobj=response_buffer)
        return unzipped_content.read()
    else:
        return response.read()
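Since Python 3.2, the BytesIO/GzipFile pair can be collapsed into gzip.decompress; an equivalent sketch:

import gzip
import urllib.request

def fetch_page(page):
    response = urllib.request.urlopen(page)
    body = response.read()
    # response.info() gives the response headers; gunzip only when the server says so
    if response.info().get('Content-Encoding') == 'gzip':
        body = gzip.decompress(body)
    return body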
Example #9
import os
import urllib.request

def Start_split(url, client_count):
    writepath = 'file.txt'
    mode = 'ab' if os.path.exists(writepath) else 'wb+'
    req = HeadRequest(url)  # HeadRequest: a Request subclass issuing HEAD, defined elsewhere
    response = urllib.request.urlopen(req)
    print("Fileinfo ==>")
    print(response.info())
    contentlength = int(response.getheader("Content-Length"))
    response.close()  # close only after the headers have been read
    global newLength
    newLength = contentlength
    print("N-Division requests")
    print("\tNo. of clients:", client_count)
    print("\tFileSize in bytes:", contentlength)
    # logging
    app.insert("\nN-Division requests")
    app.insert("\n\tNo. of clients:" + str(client_count))
    app.insert("\n\tFileSize in bytes:" + str(contentlength))
    #seekmer.create(contentlength)
    #print("sample file of content length created")
    urlRangeList = n_division(client_count, contentlength)
    for a in urlRangeList:
        print(a)
        app.insert("\n" + a)
    # build a real Request per byte range
    requests = []
    for x in urlRangeList:
        requests.append(urllib.request.Request(url, headers={'Range': x}))
    # pass urlRangeList[i] to the clients_list[i]
    clients = [[url, xx] for xx in urlRangeList]

    for test in clients:
        print(test)

    print("done")
    return clients
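n_division is not shown in this example; presumably it splits the total content length into client_count HTTP Range header values. A minimal sketch under that assumption:

def n_division(client_count, contentlength):
    # split [0, contentlength) into client_count byte ranges as Range header values
    chunk = contentlength // client_count
    ranges = []
    for i in range(client_count):
        start = i * chunk
        # the last client absorbs any remainder
        end = contentlength - 1 if i == client_count - 1 else (i + 1) * chunk - 1
        ranges.append("bytes={}-{}".format(start, end))
    return ranges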
Example #10
import json
import sys
import urllib.request
from pprint import pprint

def search_google_query(queryurl):
    try:
        response = urllib.request.urlopen(queryurl)

        #Debugging purpose
        #print('\nThe response is :')
        #print(response)

        resp_html = response.read()
        encoding = response.info().get_content_charset('utf-8')
        resp_json = json.loads(resp_html.decode(encoding))
        
        # Print the htmlTitle and link of the first five response items
        print("\nSearch results from Google are:\n")
        for item in resp_json['items'][:5]:
            pprint(item['htmlTitle'])
            pprint(item['link'])
            # htmlSnippet would add a couple of extra lines per item;
            # left out for clarity:
            #pprint(item['htmlSnippet'])
            print('\n\n')

    except urllib.error.HTTPError as e:
        # HTTPError carries an HTTP status code as well as a reason
        print("Code: %d, Reason: %s" % (e.code, e.reason))
    except urllib.error.URLError as e:
        # URLError has no .code attribute, only .reason
        print("Reason: %s" % e.reason)
    except:
        print("Unexpected Error", sys.exc_info()[0])
        raise
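search_google_query expects queryurl to point at Google's Custom Search JSON API; a hypothetical construction (key and cx are placeholders you would supply):

import urllib.parse

API_KEY = "YOUR_API_KEY"          # placeholder
CX = "YOUR_SEARCH_ENGINE_ID"      # placeholder

params = urllib.parse.urlencode({"key": API_KEY, "cx": CX, "q": "python urllib"})
queryurl = "https://www.googleapis.com/customsearch/v1?" + params
search_google_query(queryurl)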
Example #11
import json
import urllib.request
from urllib.error import HTTPError, URLError

def get_dummy_accounts():

    rest_handler()

    request_url = top_level_url + "api/dummyaccountslist/"
    req = urllib.request.Request(request_url)

    try:
        response = urllib.request.urlopen(req)
        answer = json.loads(response.read().decode(
            response.info().get_param('charset') or 'utf-8'))

    except HTTPError as e:
        print('get_dummy_accounts Error code: ', e.code)
        return None
    except URLError as e:
        print('get_dummy_accounts Reason: ', e.reason)
        return None
    else:
        return answer['data']
Example #12
import json
import urllib.request
from urllib.error import HTTPError, URLError

def get_subscription_number():

    rest_handler()

    request_url = top_level_url + "api/subscriptionnumber"
    req = urllib.request.Request(request_url)

    try:
        response = urllib.request.urlopen(req)
        answer = json.loads(response.read().decode(
            response.info().get_param('charset') or 'utf-8'))

    except HTTPError as e:
        print('get_subscription_number HTTPError code: ', e.code)
        return None
    except URLError as e:
        print('get_subscription_number URLError Reason: ', e.reason)
        return None
    else:
        print(answer['data'][0])
        return answer['data'][0]['subscriptionnumber']
Example #13
import json
import urllib.request
from urllib.error import HTTPError, URLError

def delete_user_by_id(idaccounts):

    rest_handler()

    request_url = top_level_url + "api/userdatabyid/" + idaccounts
    # Request accepts method= directly since Python 3.3
    req = urllib.request.Request(request_url, method='DELETE')

    try:
        response = urllib.request.urlopen(req)
        answer = json.loads(response.read().decode(
            response.info().get_param('charset') or 'utf-8'))

    except HTTPError as e:
        print('delete_user_by_id HTTPError code: ', e.code)
        return False
    except URLError as e:
        print('delete_user_by_id URLError Reason: ', e.reason)
        return False
    else:
        print(answer['data'])
        return True
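A plausible way these helpers combine, assuming the idaccounts field (seen in Example #7) is the account's primary key:

accounts = get_dummy_accounts()
if accounts:
    for account in accounts:
        delete_user_by_id(str(account['idaccounts']))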
Example #14
import json
import urllib.error
import urllib.request

def query_weather_info(location, is_str):

    if is_str:
        query_url = OWM_URL + "q=" + location.strip() + OWM_API
    else:
        query_url = OWM_URL + "id=" + str(location) + OWM_API

    print("\nThe query url is : {}\n".format(query_url))

    # Note: print("The query url is :" % query_url) raises
    # "TypeError: not all arguments converted during string formatting"
    # because %-formatting needs a %s placeholder in the string to consume the argument.

    resp_json = None  # defined even when the request fails
    try:
        response = urllib.request.urlopen(query_url)
        resp_data = response.read()
        encoding = response.info().get_content_charset('utf-8')
        resp_json = json.loads(resp_data.decode(encoding))
    except urllib.error.HTTPError as e:
        print("\nError\n------\nException occurred!! Code: {} Reason: {}\n".
              format(e.code, e.reason))
    except urllib.error.URLError as e:
        # URLError has no .code attribute, only .reason
        print("\nError\n------\nException occurred!! Reason: {}\n".format(e.reason))
    return resp_json
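OWM_URL and OWM_API are module-level constants not shown here; plausible definitions for the OpenWeatherMap current-weather endpoint (the key is a placeholder):

OWM_URL = "http://api.openweathermap.org/data/2.5/weather?"
OWM_API = "&appid=YOUR_API_KEY"  # placeholder key

# e.g. query_weather_info("London", True) or query_weather_info(2643743, False)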
Example #15
import gzip
import io
import urllib.parse
import urllib.request

def get_submit_videos_page(url):
    response = None  # so the finally block can test it even on early failure
    try:
        req = urllib.request.Request(url)

        # Set the request headers
        req.add_header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
        req.add_header("Accept-Language", "zh-CN,en-US;q=0.7,en;q=0.3")
        req.add_header("Connection", "keep-alive")
        req.add_header("Host", "space.bilibili.com")
        req.add_header("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0")
        # only advertise encodings the code below can decode (no "br")
        req.add_header("Accept-Encoding", "gzip, deflate")

        # Request parameters
        params = {
            "mid": "18199039",
            "page": "1",
            "pagesize": "100"
        }
        data = urllib.parse.urlencode(params).encode("utf-8")

        response = urllib.request.urlopen(req, data, timeout=10)
        if response.info()["Content-Encoding"] == "gzip":
            buf = io.BytesIO(response.read())
            f = gzip.GzipFile(fileobj=buf)
            page_content = f.read().decode("utf-8")
        else:
            page_content = response.read().decode("utf-8")

        #print(page_content)
        return page_content
    finally:
        # close exactly once, whether or not an error occurred
        if response:
            response.close()
Example #16
    def crawl(self, link):

        tryOnce = 0
        robotParser = self.setupRobotParser(link)
        if robotParser.can_fetch("*", link):
            while True:
                try:
                    response = urllib.request.urlopen(link)
                    break
                except urllib.error.HTTPError as e:
                    if e.code == 429:
                        if tryOnce == 1:
                            print(
                                'Thread ' + str(self.crawlerID) + ': Too many requests: ' + link + ' returning.')
                            return
                        print('Thread ' + str(self.crawlerID) + ': Too many requests: ' + link + ' trying again in 120 seconds.')
                        sleep(120)
                        tryOnce = 1
                    else:
                        return
                # for handling any other url errors
                except Exception:
                    print('Error opening link: ', link, ' by thread : ', self.crawlerID)

                    return

            returnedLink = response.geturl()
            if returnedLink != link:
                print('Thread ' + str(self.crawlerID) + ': Redirection:' + link + ' to ' + returnedLink + ' returning.')
                return

            urlInfo = response.info()
            dataType = urlInfo.get_content_type()
            if 'html' not in dataType:
                print('Thread ' + str(self.crawlerID) + ': Not HTML ' + link + ' returning.')
                return

            try:
                webContent = response.read().decode(response.headers.get_content_charset('utf-8'))
            except Exception:
                print("Incomplete read of web content due to a defective HTTP server.")
                webContent = None

            if(webContent):
                Crawler.webpagesLock.acquire()
                if Crawler.webpagesSaved < NUMOFPAGES:
                    Crawler.webpagesSaved += 1
                else:
                    print('Thread ' + str(self.crawlerID) + ': Page number limit reached ')
                    Crawler.webpagesLock.release()
                    return
                Crawler.webpagesLock.release()
                selector = None
                while True:
                    try:
                        selector = WebPages.select().where(WebPages.pageURL == returnedLink).exists()
                        break
                    except (OperationalError, sqlite3.OperationalError) as e:
                        if 'binding' in str(e):
                            break
                        print('Thread ', self.crawlerID, ': Database busy, retrying. WebPagesTable')
                    except Exception:
                        break

                if selector:
                    print('Thread ' + str(self.crawlerID) + ': Updating webpage ' + link)

                    while True:
                        try:
                            WebPages.update(pageContent=webContent).where(
                                WebPages.pageURL == returnedLink).execute()
                            break
                        except (OperationalError, sqlite3.OperationalError) as e:
                            if 'binding' in str(e):
                                break
                            print('Thread ', self.crawlerID, ': Database busy, retrying. WebPagesTable')
                        except Exception:
                            break

                else:
                    print('Thread ' + str(self.crawlerID) + ': Saving webpage ' + link )
                    try:
                        inserted = False
                        while True:
                            try:
                                if not inserted:
                                    WebPages(pageURL=returnedLink, pageContent=webContent).save()
                                    inserted =  True
                                ...
                                PageRank.create(pageURL=returnedLink).update()
                                ...
                                break
                            except (OperationalError, sqlite3.OperationalError) as e:
                                if 'binding' in str(e):
                                    break
                                print('Thread ', self.crawlerID, ': Database busy, retrying. WebPagesTable & PageRank')
                                sleep(randint(1,5))

                            except Exception:
                                break
                    #should never happen
                    except Exception:
                        print('UnexpectedException: In saving webpage WEEEEEEEEEEEEEEEEEEEEEEE')

                print('Thread ' + str(self.crawlerID) + ': Done saving webpage and starting link extraction ' + link)
                try:
                    parser = MyHTMLParser(link)
                    parser.feed(str(webContent))
                #should never happen
                except Exception:
                    print('UnexpectedException: in parser WEEEEEEEEEEEEEEEEEEEEEEE')

                size = 999  # stay below SQLite's default 999-variable limit per statement
                while True:
                    try:
                        for i in range(0, len(parser.links), size):
                            UncrawledTable.insert_many(parser.links[i:i + size]).upsert().execute()
                        break
                    except (OperationalError, sqlite3.OperationalError) as e:
                        if 'binding' in str(e):
                            break
                        print('Thread ', self.crawlerID, ': Database busy, retrying. UnCrawledTable')
                    except Exception:
                        break

                while True:
                    try:
                        print("UNCRAWLED URLS = ", UncrawledTable.select().count(), ' Thread ' + str(self.crawlerID))
                        break
                    except (OperationalError, sqlite3.OperationalError) as e:
                        if 'binding' in str(e):
                            break
                        print('Thread ', self.crawlerID, ': Database busy, retrying. print UnCrawledTable')
                    except Exception:
                        break

                print('Thread ' + str(self.crawlerID) + ': Done inserting links ' + link)
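MyHTMLParser is defined elsewhere in this project; judging from parser.feed(str(webContent)) and parser.links being bulk-inserted into UncrawledTable, it presumably collects absolute link URLs. A minimal sketch under that assumption (the pageURL key is hypothetical and must match UncrawledTable's column):

from html.parser import HTMLParser
from urllib.parse import urljoin

class MyHTMLParser(HTMLParser):
    def __init__(self, baseURL):
        super().__init__()
        self.baseURL = baseURL
        self.links = []  # rows shaped for UncrawledTable.insert_many()

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    # resolve relative hrefs against the page being crawled
                    self.links.append({'pageURL': urljoin(self.baseURL, value)})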
Example #17
#     #     queryval = {'q': 'hello'}
#     #     querystr = urllib.parse.urlencode(queryval)
#     #     url = url + '?' + querystr
#     #     req = urllib.request.Request(url, None, headers)
#     #     # with urllib.request.urlopen(req) as response:
#     #     #     rtnpage = response.read().decode('utf-8')
#     #     #     print(rtnpage)
#     #     try:
#     #         urllib.request.urlopen(req)
#     #     except urllib.error.HTTPError as e:
#     #         print(e.code)
#     #         print(e.read())
if __name__ == '__main__':
    import os
    import urllib.request

    with urllib.request.urlopen('http://python.org') as response:
        print('real url---', response.geturl())
        # response.info() yields the response headers as key/value pairs
        for k, v in response.info().items():
            print(k, '==', v)
        html = response.read()

        if os.path.exists('tmp.html') and os.path.isfile('tmp.html'):
            print('remove tmp.html')
            os.remove('tmp.html')
        # opening with 'w' creates the file, so no "touch" subprocess is needed
        with open('tmp.html', 'w') as filehandle:
            for line in html.splitlines():
                filehandle.write(line.decode('utf-8'))
                filehandle.write('\n')
Example #18

# encoding=UTF-8

import urllib.request as request
import urllib.response
import json
import sys

GossipingIndexUrl = "https://www.ptt.cc/bbs/Gossiping/index.html"

#httpsConOpener = urllib.request.build_opener()
#response = httpsConOpener.open(GossipingIndexUrl)
response = request.urlopen(GossipingIndexUrl)

resp_url = response.geturl()
#resp_info = response.info()
#resp_content = response.read().decode("UTF-8")

#print(resp_url)
#print(resp_info)
#print(resp_content)

if "ask/over18" in resp_url:
	print("ooops")
	post_req =request.Request(resp_url, b"yes:yes")
	response = request.urlopen(post_req)
	print(response.geturl())
	print(response.info())
	print(response.read())

print("end")
Example #19
import re
import requests
from urllib import request
from bs4 import BeautifulSoup

payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.post('http://httpbin.org/post', payload)
print("#No.0001:")
print(ret.text)

url = 'https://www.baidu.com/'

req = request.Request(url)
response = request.urlopen(req)
print("#No.1==>type of response:")
content = response.read()
con1 = response.readlines()
con2 = response.info()
con3 = response.getcode()
con4 = response.geturl()
print(content)
print(con1, "\n", con2, "\n", con3, "\n", con4, "\n")

url2 = 'http://blog.csdn.net/ritterliu/article/details/70243112'
req2 = request.Request(url2)
response2 = request.urlopen(req2)
content2 = BeautifulSoup(response2.read(), "html5lib")

print("#No.2==>", content2.title)
print("#No.3==>", content2.find_all(name='h1'))

namelist = content2.find_all(name='img')
print("#No.4==>")
Example #20

#!/usr/bin/python3
''' takes in a URL, sends a request to the URL '''

if __name__ == "__main__":
    from urllib import request, response
    from sys import argv

    req = request.Request(argv[1])
    with request.urlopen(req) as response:
        respuesta = response.info()
        print(respuesta["X-Request-Id"])