Example #1
def get_aid_page(aid_url):
    response = None
    try:
        req = urllib.request.Request(aid_url)

        # Set the request headers
        req.add_header("Host", "www.bilibili.com")
        req.add_header("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0")
        req.add_header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
        req.add_header("Accept-Language", "zh-CN,en-US;q=0.7,en;q=0.3")
        req.add_header("Connection", "keep-alive")
        req.add_header("Accept-Encoding", "gzip, deflate")

        response = urllib.request.urlopen(req, timeout=10)
        if response.info()["Content-Encoding"] == "gzip":
            buf = io.BytesIO(response.read())
            f = gzip.GzipFile(fileobj=buf)
            aid_page_content = f.read().decode("utf-8")
        else:
            aid_page_content = response.read().decode('utf-8')

        #print(aid_page_content)
        return aid_page_content
    except Exception:
        raise
    finally:
        if response:
            response.close()
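A minimal variant of the same gzip-aware fetch written with context managers, so the response and the GzipFile are closed even if decoding fails; the function name is hypothetical and only a subset of the headers above is shown:

import gzip
import io
import urllib.request

def get_aid_page_cm(aid_url):
    # Hypothetical variant of get_aid_page(): same fetch, but context managers
    # replace the explicit try/finally close.
    req = urllib.request.Request(aid_url, headers={
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0",
        "Accept-Encoding": "gzip, deflate",
    })
    with urllib.request.urlopen(req, timeout=10) as response:
        raw = response.read()
        if response.info().get("Content-Encoding") == "gzip":
            with gzip.GzipFile(fileobj=io.BytesIO(raw)) as f:
                return f.read().decode("utf-8")
        return raw.decode("utf-8")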
Example #2
def spider(url):
    global sleep_time
    request = urllib.request.Request(url)
    # The header below makes the request look like a mobile browser: the imooc (慕课网) app can play videos without registering, so posing as a mobile browser lets the program download them directly
    request.add_header('user-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36')
    while True:
        try:
            time.sleep(sleep_time)
            response = urllib.request.build_opener().open(request)
            if response.getcode() == 200:
                html = response.read()
                response.close()
                if html is not None:
                    if sleep_time > 5:
                        sleep_time -= 1
                    return html
                else:
                    continue
        except urllib.error.URLError as e:
            print(e.reason, ':', url)
        except socket.timeout as e:
            print("-----socket timout:", url)
        except Exception:
            if sleep_time < 20:
                sleep_time += 1
            print('******************** unknown error occurred, backing off *****************')
            print("************************ now sleep time is: %d *********************" % sleep_time)
Example #3
    def test_get_stream_offset(self):
        response = urllib.request.urlopen(
            "http://localhost:8888/stream?stream_time=1529488179412403&offset=15"
        )
        contents = response.read()
        response.close()
        self.check_archive_with_offset(contents, 15, 15)
Example #4
def Downloader():
    print("Commencing download")
    inputValue = link.get("1.0", "end-1c")
    print(inputValue)
    x = 'bytes=0-'  # getting wrong file size
    remaining_download_tries = 5
    #url="https://download.netbeans.org/netbeans/8.0.2/final/bundles/netbeans-8.0.2-javase-windows.exe"
    #https://www.codesector.com/files/teracopy.exe
    downloadname = str(inputValue.split('/')[-1])  # gives proper filename
    downloadpath = Dpath + downloadname
    print(downloadpath)
    req = urllib.request.Request(inputValue, headers={'Range': x})
    response = urllib.request.urlopen(req)
    contentlength = int(response.getheader("Content-Length"))
    response.close()
    print(contentlength)
    while remaining_download_tries > 0:
        try:
            print("starting download")
            with urllib.request.urlopen(req) as fsrc, open(
                    downloadpath, 'w+b'
            ) as fdst:  # replace open() with NamedTemporaryFile(delete=False) for a temp-file download
                copyfileobj(fsrc, fdst, 16 * 1024, contentlength)
                print("complete")
            break  # success, stop retrying
        except Exception:
            remaining_download_tries -= 1
            print("retrying download")
            # fall back to a plain request without the Range header on the next attempt
            req = urllib.request.Request(inputValue)
    print("D")
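The four-argument copyfileobj call above does not match shutil.copyfileobj (which takes fsrc, fdst and an optional buffer length), so the project presumably defines its own helper elsewhere. A hypothetical stand-in with that signature, reporting progress against the known Content-Length:

def copyfileobj(fsrc, fdst, length=16 * 1024, total=None):
    # Hypothetical helper, not the original project's implementation: copy in
    # `length`-byte chunks and, if `total` is given, print running progress.
    copied = 0
    while True:
        buf = fsrc.read(length)
        if not buf:
            break
        fdst.write(buf)
        copied += len(buf)
        if total:
            print("downloaded %d of %d bytes" % (copied, total))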
Example #5
    def test_get_frames_empty(self):
        response = urllib.request.urlopen(
            "http://localhost:8888/stream?stream_time=1529488179412403&frame_time=2581101"
        )
        contents = response.read()
        response.close()
        self.check_archive_with_frames(contents, [], 0)
Example #6
    def test_get_frames(self):
        response = urllib.request.urlopen(
            "http://localhost:8888/stream?stream_time=1529488179412403&"
            "frame_time=0&frame_time=630021&frame_time=1210079&frame_time=2581100&frame_time=2581101"
        )
        contents = response.read()
        response.close()
        self.check_archive_with_frames(contents, [0, 7, 18, 29], 4)
Example #7
def get_url_content(website):
    headers = {'Accept-Charset': 'utf-8',
               'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'}
    request = urllib.request.Request(website, headers=headers)
    response = urllib.request.urlopen(request)
    html = response.read()
    response.close()
    return html
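get_url_content returns the raw response bytes, so decode before treating the result as text; a minimal usage sketch (the URL is a placeholder):

html_bytes = get_url_content("http://example.com")
text = html_bytes.decode("utf-8", errors="replace")  # read() returns bytes, so decode explicitly
print(text[:200])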
Example #8
def query(query,
          useragent='python-duckduckgo ' + str(__version__),
          safesearch=True,
          html=False,
          meanings=True,
          **kwargs):
    """
    Query DuckDuckGo, returning a Results object.

    Here's a query that's unlikely to change:

    >>> result = query('1 + 1')
    >>> result.type
    'nothing'
    >>> result.answer.text
    '1 + 1 = 2'
    >>> result.answer.type
    'calc'

    Keyword arguments:
    useragent: UserAgent to use while querying. Default: "python-duckduckgo %d" (str)
    safesearch: True for on, False for off. Default: True (bool)
    html: True to allow HTML in output. Default: False (bool)
    meanings: True to include disambiguations in results (bool)
    Any other keyword arguments are passed directly to DuckDuckGo as URL params.
    """

    safesearch = '1' if safesearch else '-1'
    html = '0' if html else '1'
    meanings = '0' if meanings else '1'
    params = {
        'q': query,
        'o': 'json',
        'kp': safesearch,
        'no_redirect': '1',
        'no_html': html,
        'd': meanings,
    }
    params.update(kwargs)
    encparams = urllib.parse.urlencode(params)
    url = 'http://api.duckduckgo.com/?' + encparams

    request = urllib.request.Request(url, headers={'User-Agent': useragent})
    response = urllib.request.urlopen(request)
    json = j.loads(response.read().decode('utf-8'))
    response.close()

    return Results(json)
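For reference, a minimal sketch of the request URL the params dict produces for the docstring's example query, using the default safesearch/html/meanings values:

import urllib.parse

params = {'q': '1 + 1', 'o': 'json', 'kp': '1', 'no_redirect': '1', 'no_html': '1', 'd': '1'}
print('http://api.duckduckgo.com/?' + urllib.parse.urlencode(params))
# -> http://api.duckduckgo.com/?q=1+%2B+1&o=json&kp=1&no_redirect=1&no_html=1&d=1
#    (exact key order assumes insertion-ordered dicts, Python 3.7+)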
Example #9
def test_remote():
    while not web_paths.empty():
        path = web_paths.get()
        url = "%s%s" % (target, path)

        request = urllib.request.Request(url)

        try:
            response = urllib.request.urlopen(request)
            content = response.read()

            print("[%d] => %s" % (response.code, path))
            response.close()

        except Exception:
            print("failed")
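test_remote relies on two module-level names defined elsewhere; a hypothetical setup sketch (the host and paths are placeholders):

import queue

target = "http://example.com"   # base URL being probed (placeholder)
web_paths = queue.Queue()       # queue of candidate paths to request
for candidate in ("/index.html", "/admin/", "/robots.txt"):
    web_paths.put(candidate)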
Example #10
def query(query, useragent='python-duckduckgo '+str(__version__), safesearch=True, html=False, meanings=True, **kwargs):
    """
    Query DuckDuckGo, returning a Results object.

    Here's a query that's unlikely to change:

    >>> result = query('1 + 1')
    >>> result.type
    'nothing'
    >>> result.answer.text
    '1 + 1 = 2'
    >>> result.answer.type
    'calc'

    Keyword arguments:
    useragent: UserAgent to use while querying. Default: "python-duckduckgo %d" (str)
    safesearch: True for on, False for off. Default: True (bool)
    html: True to allow HTML in output. Default: False (bool)
    meanings: True to include disambiguations in results (bool)
    Any other keyword arguments are passed directly to DuckDuckGo as URL params.
    """

    safesearch = '1' if safesearch else '-1'
    html = '0' if html else '1'
    meanings = '0' if meanings else '1'
    params = {
        'q': query,
        'o': 'json',
        'kp': safesearch,
        'no_redirect': '1',
        'no_html': html,
        'd': meanings,
        }
    params.update(kwargs)
    encparams = urllib.parse.urlencode(params)
    url = 'http://api.duckduckgo.com/?' + encparams

    request = urllib.request.Request(url, headers={'User-Agent': useragent})
    response = urllib.request.urlopen(request)
    json = j.loads(response.read().decode('utf-8'))
    response.close()

    return Results(json)
Example #11
def Start_split(url, client_count):
    url = url
    client_count = client_count
    writepath = 'file.txt'
    mode = 'ab' if os.path.exists(writepath) else 'wb+'
    req = HeadRequest(url)
    response = urllib.request.urlopen(req)
    print("Fileinfo ==>")
    print(response.info())
    strRes = str(response.info())
    contentlength = int(response.getheader("Content-Length"))
    response.close()
    global newLength
    newLength = contentlength
    print("N-Division requests")
    print("\tNo. of clients:", client_count)
    print("\tFileSize in bytes:", contentlength)
    # logging
    app.insert("\nN-Division requests")
    app.insert("\n\tNo. of clients:" + str(client_count))
    app.insert("\n\tFileSize in bytes:" + str(contentlength))
    #seekmer.create(contentlength)
    #print("sample file of content length created")
    urlRangeList = n_division(client_count, contentlength)
    for a in urlRangeList:
        print(a)
        app.insert("\n" + a)
    requests = []
    for x in urlRangeList:
        ss = "urllib.request.Request(" + url + ", headers={'Range':" + x + "})"
        requests.append(ss)
    # pass urlRangeList[i] to the clients_list[i]
    clients = [[url, xx] for xx in urlRangeList]

    for test in clients:
        print(test)

    print("done")
    return clients
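n_division is defined elsewhere in the project; judging from how its output is used as 'Range' header values, a hypothetical sketch might look like this:

def n_division(client_count, contentlength):
    # Hypothetical sketch, not the project's real implementation: split
    # `contentlength` bytes into `client_count` contiguous Range header values.
    chunk = contentlength // client_count
    ranges = []
    for i in range(client_count):
        start = i * chunk
        end = contentlength - 1 if i == client_count - 1 else (i + 1) * chunk - 1
        ranges.append("bytes=%d-%d" % (start, end))
    return ranges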
Example #12
    def _process_uncached_url(self, url, no_description=False):
        request = urllib.request.Request(url, headers={
            "User-Agent": self.userAgent,
            "Accept": self.acceptHeader
        })
        try:
            startTime = time.time()
            response = urllib.request.urlopen(request, timeout=self.timeout)
            timeTaken = time.time() - startTime
        except socket.timeout:
            raise URLLookupError("Timed out")
        except urllib.error.URLError as err:
            raise URLLookupError(err.reason)
        except Exception as err:
            raise URLLookupError(type(err).__name__)
        try:

            newURL = response.geturl()
            if newURL != url and self.showRedirects:
                yield "→ <{0}>".format(newURL)

            self.annotateResponse(response)

            if response.contentLength is not None:
                sizeFormatted = formatBytes(response.contentLength)
            else:
                sizeFormatted = "unknown size"

            responseIter = iter(response.handler.processResponse(response, no_description=no_description))
            firstLine = next(responseIter)
            for line in self.responseFormats:
                yield line.format(time=timeTaken,
                        size=sizeFormatted,
                        plugin=firstLine)
            for line in responseIter:
                yield line
        finally:
            response.close()
            del response.buf
            del response
Example #13
def get_submit_videos_page(url):
    response = None
    try:
        req = urllib.request.Request(url)

        # Set the request headers
        req.add_header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
        req.add_header("Accept-Language", "zh-CN,en-US;q=0.7,en;q=0.3")
        req.add_header("Connection", "keep-alive")
        req.add_header("Host", "space.bilibili.com")
        req.add_header("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0")
        req.add_header("Accept-Encoding", "gzip, deflate, br")

        # Request parameters
        params = {
            "mid": "18199039",
            "page": "1",
            "pagesize": "100"
        }
        data = urllib.parse.urlencode(params).encode("utf-8")

        response = urllib.request.urlopen(req, data, timeout=10)
        if response.info()["Content-Encoding"] == "gzip":
            buf = io.BytesIO(response.read())
            f = gzip.GzipFile(fileobj=buf)
            page_content = f.read().decode("utf-8")
        else:
            page_content = response.read().decode("utf-8")

        #print(page_content)
        return page_content
    except Exception:
        raise
    finally:
        if response:
            response.close()
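Passing data to urlopen makes the request above a POST. If the endpoint expects a GET instead, a minimal variant (an assumption, not part of the original example) would append the same parameters to the query string:

# Sketch of a GET variant, reusing `url` and `params` from the example above;
# the request headers would be added the same way as before.
get_req = urllib.request.Request(url + "?" + urllib.parse.urlencode(params))
with urllib.request.urlopen(get_req, timeout=10) as response:
    page_content = response.read()  # still gzip-compressed if the server compressed the body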
Example #14
    '''
    url = 'https://www.baidu.com/s?wd=pycharm%20web%20tomcat%20%E9%83%A8%E7%BD%B2&rsv_spt=1&rsv_iqid=0xb25600d300005801&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&oq=pycharm%20tomcat%20%E9%83%A8%E7%BD%B2&rsv_t=02279cxCOeMjpOzNhCqHcXCv7p2JPX29wSb37PHvoVdhnh1%2FPejpS3fVk84ZcHCYz79c&inputT=595&rsv_pq=966a0ca000007720&rsv_sug3=27&rsv_sug1=7&rsv_sug2=0&rsv_sug7=100&rsv_sug4=1986&rsv_sug=1'
    unquoteurl = unquote(url)
    urlparses = urlparse(unquoteurl)
    print(unquoteurl)

    #print urlparse information
    for info in urlparses:
        print(info)
    '''

    #demo for htmlparser
    '''
    myhtmlparser1 = HtmlParsor()
    myhtmlparser1.feed("<html><title>hello</title><body>hello world!!!</body></html>")
    myhtmlparser1.close()
    '''

    #get url's content
    #myrequest = urllib.request('http://www.4hb.com/letters/ltrdelacct4.html')
    file = "D:\\INFOR.html"

    shp = SelfHtmlParser()
    response = urllib.request.urlopen(
        'http://www.4hb.com/letters/ltrdelacct4.html')
    mycontent = response.read()
    reg = shp(mycontent)
    openfile = open(file, "a+")
    response.close()
    print(reg.title)

    pass
Example #15
def download(url, headers={}):
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    content = response.read().decode('utf-8')
    response.close()
    return content
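A minimal usage sketch for download; the URL and the header value are placeholders:

page = download("http://example.com",
                headers={"User-Agent": "Mozilla/5.0"})
print(page[:200])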
Example #16
    '''
    url = 'https://www.baidu.com/s?wd=pycharm%20web%20tomcat%20%E9%83%A8%E7%BD%B2&rsv_spt=1&rsv_iqid=0xb25600d300005801&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&oq=pycharm%20tomcat%20%E9%83%A8%E7%BD%B2&rsv_t=02279cxCOeMjpOzNhCqHcXCv7p2JPX29wSb37PHvoVdhnh1%2FPejpS3fVk84ZcHCYz79c&inputT=595&rsv_pq=966a0ca000007720&rsv_sug3=27&rsv_sug1=7&rsv_sug2=0&rsv_sug7=100&rsv_sug4=1986&rsv_sug=1'
    unquoteurl = unquote(url)
    urlparses = urlparse(unquoteurl)
    print(unquoteurl)

    #print urlparse information
    for info in urlparses:
        print(info)
    '''

    #demo for htmlparser
    '''
    myhtmlparser1 = HtmlParsor()
    myhtmlparser1.feed("<html><title>hello</title><body>hello world!!!</body></html>")
    myhtmlparser1.close()
    '''

    #get url's content
    #myrequest = urllib.request('http://www.4hb.com/letters/ltrdelacct4.html')
    file = "D:\\INFOR.html"

    shp = SelfHtmlParser()
    response = urllib.request.urlopen('http://www.4hb.com/letters/ltrdelacct4.html')
    mycontent = response.read()
    reg = shp(mycontent)
    openfile = open(file, "a+")
    response.close()
    print(reg.title)

    pass