import urllib.request as urr  # assumed alias; the snippet refers to urllib.request as "urr"
import pandas as pd
from bs4 import BeautifulSoup


def wikiscrape():
    req = urr.Request('https://en.wikipedia.org/wiki/List_of_data_breaches')
    content = urr.urlopen(req)
    cs = content.info().get_content_charset()  # charset of the web page

    html = content.read().decode(cs)  # HTML source code

    print("Downloading...")

    soup = BeautifulSoup(html, 'html.parser')

    data = []

    tabclasses = []

    tables = soup.find_all("table")  # e.g. class="wikitable"

    for index, tab in enumerate(tables):
        data.append([])
        tabclasses.append(tab.attrs)
        for items in tab.find_all("tr"):
            cols = items.find_all(["th", "td"])
            cols = [ele.text.strip() for ele in cols]
            data[index].append([ele for ele in cols if ele])  # drop empty cells

    df = pd.DataFrame()

    for rows in data:
        df = pd.concat([df, pd.DataFrame(rows)])  # DataFrame.append() was removed in pandas 2.0
        df = df.replace(r'\n', '', regex=True)

    print('N dimensions of data {}'.format(df.shape))
    print('Function call complete')
    return df
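A minimal usage sketch for wikiscrape() above; the output filename is a hypothetical example, not part of the original snippet.

if __name__ == "__main__":
    breaches = wikiscrape()
    # write every scraped table row to one CSV (hypothetical output path)
    breaches.to_csv("data_breaches.csv", index=False)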
Example No. 2
def download_stock_data(csv_url):
    response = request.urlopen(csv_url)  # open the connection
    csv = response.read()                # raw CSV bytes
    csv_str = csv.decode('utf-8')        # decode bytes to text instead of str(), which keeps the b'...' wrapper
    lines = csv_str.split("\n")          # break the string at each new line
    dest_url = r'goog.csv'
    with open(dest_url, "w") as fx:      # write the rows back out to a local file
        fx.write("\n".join(lines))
Example No. 3
 def test_urlwithfrag(self):
     urlwith_frag = "https://docs.python.org/2/glossary.html#glossary"
     with support.transient_internet(urlwith_frag):
         req = request.Request(urlwith_frag)
         res = yield from request.urlopen(req)
         self.assertEqual(res.geturl(),
                 "https://docs.python.org/2/glossary.html#glossary")
Example No. 4
 def test_redirect_url_withfrag(self):
     redirect_url_with_frag = "http://bit.ly/1iSHToT"
     with support.transient_internet(redirect_url_with_frag):
         req = request.Request(redirect_url_with_frag)
         res = yield from request.urlopen(req)
         self.assertEqual(res.geturl(),
                 "https://docs.python.org/3.4/glossary.html#term-global-interpreter-lock")
 def test_iteration(self):
     expected_response = b"pycon 2008..."
     handler = self.start_server([(200, [], expected_response)])
     resp = yield from request.urlopen("http://localhost:%s" % handler.port)
     line = yield from resp.read()
     #for line in data:
     self.assertEqual(line, expected_response)
Example No. 6
 def download_page(url):
     version = (3, 0)
     cur_version = sys.version_info
     if cur_version >= version:  # Python 3.0 or above
         import urllib.request  # urllib.request for fetching web pages
         try:
             headers = {}
             headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
             req = urllib.request.Request(url, headers=headers)
             resp = urllib.request.urlopen(req)
             respData = str(resp.read())
             return respData
         except Exception as e:
             print(str(e))
     else:  # Python 2.x
         import urllib2
         try:
             headers = {}
             headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
             req = urllib2.Request(url, headers=headers)
             response = urllib2.urlopen(req)
             page = response.read()
             return page
         except Exception:
             return "Page Not found"
Example No. 7
def download_dataset(url, file, path_dataset):
    if not os.path.exists(path_dataset):
        os.makedirs(path_dataset)
    if not os.path.exists(file):
        data = request.urlopen(url).read()
        with open(file, "wb") as f:
            f.write(data)
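A possible call for the download_dataset helper above; the URL, filename, and directory are illustrative placeholders only.

# hypothetical example values -- not from the original snippet
dataset_url = "https://example.com/dataset.zip"
local_file = "data/dataset.zip"
download_dataset(dataset_url, local_file, "data")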
 def test_basic_auth_success(self):
     ah = request.HTTPBasicAuthHandler()
     ah.add_password(self.REALM, self.server_url, self.USER, self.PASSWD)
     request.install_opener(request.build_opener(ah))
     try:
         _u = yield from request.urlopen(self.server_url)
         self.assertTrue(_u)
     except error.HTTPError:
         self.fail("Basic auth failed for the url: %s", self.server_url)
Example No. 9
    def test_sites_no_connection_close(self):
        # Some sites do not send Connection: close header.
        # Verify that those work properly. (#issue12576)

        URL = 'http://www.imdb.com' # mangles Connection:close

        with support.transient_internet(URL):
            try:
                res = yield from request.urlopen(URL)
            except ValueError:
                self.fail("urlopen failed for site not sending "
                          "Connection: close")
            else:
                self.assertTrue(res)

            req = yield from request.urlopen(URL)
            res = yield from req.read()
            self.assertTrue(res)
Example No. 10
def download_file(csv_url):
    # download the CSV over the network
    response = request.urlopen(csv_url)
    csv = response.read()
    csv_str = csv.decode('utf-8')  # decode bytes to text
    lines = csv_str.split("\n")    # split into rows
    dest_url = r'goo.csv'
    fx = open(dest_url, "w")
    for line in lines:
        fx.write(line + '\n')
    fx.close()
 def test_basic(self):
     handler = self.start_server()
     open_url = yield from request.urlopen("http://localhost:%s" % handler.port)
     for attr in ("read", "close", "info", "geturl"):
         self.assertTrue(hasattr(open_url, attr), "object returned from "
                      "urlopen lacks the %s attribute" % attr)
     try:
         _r = yield from open_url.read()
         self.assertTrue(_r, "calling 'read' failed")
     finally:
         open_url.close()
Example No. 12
def get_access_token():
    url = "https://openapi.baidu.com/oauth/2.0/token?grant_type=client_credentials&client_id=ZrjLfF5Rh7pOL66gaOmDGnXn&client_secret=16bac9645093ca2632ebb81015ff7544"

    req = request.Request(url, method="POST")
    resp = request.urlopen(req)
    data = resp.read().decode('utf-8')
    json_data = json.loads(data)

    global bda_access_token
    bda_access_token = json_data['access_token']

    return bda_access_token
def urlopen_with_retry(*args, **kwargs):
    retry_time = 3
    for i in range(retry_time):
        try:
            return request.urlopen(*args, **kwargs)
        except socket.timeout as e:
            if i + 1 == retry_time:
                raise e
        # try to work around Youku CDN failures
        except error.HTTPError as http_error:
            if i + 1 == retry_time:
                raise http_error
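A sketch of how the retry helper above might be called; the URL and timeout are assumptions, and any urlopen() arguments pass straight through.

# hypothetical usage of urlopen_with_retry
html = urlopen_with_retry('https://example.com/video', timeout=10).read()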
Example No. 15
def label(deqstr):
    from urllib import request
    emailstext = []
    for g in deqstr:
        url = s3_root + g  # s3_root is assumed to be defined at module level
        response = request.urlopen(url)
        raw_text = response.read().decode('utf8')
        emailstext.append(g)
        emailstext.append(raw_text)
    x = "".join(emailstext)  # join once, after the loop
    return x
 def test_info(self):
     handler = self.start_server()
     try:
         open_url = yield from request.urlopen(
             "http://localhost:%s" % handler.port)
         info_obj = open_url.info()
         self.assertIsInstance(info_obj, email.message.Message,
                               "object returned by 'info' is not an "
                               "instance of email.message.Message")
         self.assertEqual(info_obj.get_content_subtype(), "plain")
     finally:
         self.server.stop()
 def test_line_iteration(self):
     lines = [b"We\n", b"got\n", b"here\n", b"verylong " * 192 + b"\n"]
     expected_response = b"".join(lines)
     handler = self.start_server([(200, [], expected_response)])
     resp = yield from request.urlopen("http://localhost:%s" % handler.port)
     data = yield from resp.readlines(4)
     for index, line in enumerate(data):
         self.assertEqual(line, lines[index],
                          "Fetched line number %s doesn't match expected:\n"
                          "    Expected length was %s, got %s" %
                          (index, len(lines[index]), len(line)))
     self.assertEqual(index + 1, len(lines))
 def urlopen(self, url, data=None, **kwargs):
     l = []
     f = yield from request.urlopen(url, data, **kwargs)
     try:
         # Exercise various methods
         #_f = yield from f.readlines(200)
         #l.extend(_f)
         l.append((yield from f.readline()))
         l.append((yield from f.read(1024)))
         l.append((yield from f.read()))
     finally:
         f.close()
     return b"".join(l)
Example No. 19
def main(WebUrl):
    # globals flist
    if __name__ == "__main__":
        lparser = parserLinks()
        web = request.urlopen(WebUrl)
        # context= web.read()
        for context in web.readlines():
            _str = "%s" % context
            try:
                lparser.feed(_str)
            except parser.HTMLParseError:
                # print( "parser error")
                pass
        web.close()
        imagelist = lparser.getfilelist()
        downjpgmutithread(imagelist)
Example No. 21
def downjpg(filepath, FileName="default.jpg"):
    try:
        FileName = getFileName(filepath)
        web = request.urlopen(filepath)
        print("Fetching remote file " + filepath + "\n")
        jpg = web.read()
        DstDir = "G:\\happy_python\\"
        print("Saving file " + DstDir + FileName + "\n")
        try:
            File = open(DstDir + FileName, "wb")
            File.write(jpg)
            File.close()
            return
        except IOError:
            print("error\n")
            return
    except Exception:
        print("error\n")
        return
Example No. 23
def read_page(url, page_num, keyword):  # mimic a browser POST request and read the returned page
    page_headers = {
        'Host': 'www.lagou.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
        'Connection': 'keep-alive'
    }
    if page_num == 1:
        boo = 'true'
    else:
        boo = 'false'
    page_data = parse.urlencode([  # page inspection shows the browser submits FormData with these fields
        ('first', boo), ('pn', page_num), ('kd', keyword)
    ])
    req = request.Request(url, headers=page_headers)
    page = request.urlopen(req, data=page_data.encode('utf-8')).read()
    page = page.decode('utf-8')
    return page
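A hedged usage sketch for read_page above; the Ajax endpoint and keyword are assumptions based on the Host header in the snippet, not part of the original code.

# hypothetical call -- endpoint and keyword are illustrative
url = 'https://www.lagou.com/jobs/positionAjax.json'
first_page = read_page(url, 1, 'python')
print(first_page[:200])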
 def test_geturl(self):
     # Make sure same URL as opened is returned by geturl.
     handler = self.start_server()
     open_url = yield from request.urlopen("http://localhost:%s" % handler.port)
     url = open_url.geturl()
     self.assertEqual(url, "http://localhost:%s" % handler.port)
Example No. 25
def getRequestedTemp():
    return (json.loads(urlopen(Request(requestedTemperature, **requestOpts)).read().decode("utf-8")))["tempF"]
 def test_sending_headers(self):
     handler = self.start_server()
     req = request.Request("http://localhost:%s/" % handler.port,
                                  headers={"Range": "bytes=20-39"})
     yield from request.urlopen(req)
     self.assertEqual(handler.headers_received["Range"], "bytes=20-39")
Example No. 27

from urllib import request

# You can see the HTTP response headers and the JSON data
with request.urlopen('https://api.douban.com/v2/book/2129650') as f:
    data = f.read()
    print('Status:', f.status, f.reason)
    for k, v in f.getheaders():
        print('%s: %s' % (k, v))
    print('Data:', data.decode('utf-8'))

# To simulate a browser sending a GET request, use a Request object and add HTTP headers to it
req = request.Request('http://www.douban.com/')
req.add_header('User-Agent', 'Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25')
with request.urlopen(req) as f:
    print('Status:', f.status, f.reason)
    for k, v in f.getheaders():
        print('%s: %s' % (k, v))
    print('Data:', f.read().decode('utf-8'))
Example No. 28
# add the correct User-Agent
headers = {
    'user-agent':
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
}

# the company page you're about to scrape
company_page = 'https://angel.co/uber'
#company_page = 'https://mhcdev.com/login'

# open the page with the User-Agent header set
page_request = request.Request(company_page, headers=headers)
page = request.urlopen(page_request)

# parse the html using beautiful soup
html_content = BeautifulSoup(page, 'html.parser')

# we parse the title
title = html_content.find('h1')
title = title.text.strip()
print(title)

# we parse the description
description = html_content.find('h2',
                                attrs={'class': 'js-startup_high_concept'})
description = description.text.strip()
print(description)
Example No. 29
import urllib.request
from urllib.request import urlopen
a = urllib.request.urlopen('https://www.baidu.com').read()
print(len(a))
a1 = urlopen('http://m.baidu.com').read()
print(len(a1))
from urllib import request
a2 = request.urlopen('http://jd.com').read()
print(len(a2))
Example No. 30
# In[7]:


from urllib import request


# In[8]:


dir(request)


# In[9]:


html = request.urlopen(url)  # assumes url was defined in an earlier notebook cell


# In[10]:


html


# In[11]:


soup = BeautifulSoup(html, 'html.parser')  # requires: from bs4 import BeautifulSoup


# In[12]:
Example No. 31
from urllib import request
url = "http://www.baidu.com"
res = request.urlopen(url)
# inspect the response
print(res.info())
print(res.getcode())
print(res.geturl())
Example No. 32
def setRequestedTemp(f):
    urlopen(
        Request(url=requestedTemperature, method="PUT", data=json.dumps({"tempF": f}).encode("utf-8"), **requestOpts)
    )
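getRequestedTemp (Example No. 25) and setRequestedTemp above rely on module-level requestedTemperature and requestOpts names that the snippets do not define; a minimal sketch of what they might look like, with the URL and options purely as assumptions:

# hypothetical module-level configuration assumed by the two functions above
requestedTemperature = "http://thermostat.local/api/requested-temp"  # illustrative endpoint
requestOpts = {"headers": {"Content-Type": "application/json"}}      # extra Request() keyword arguments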
Example No. 33
from urllib import request

resp = request.urlopen('http://www.baidu.com')
print(resp.read())

headers = {}