Example #1
from urllib.request import urlopen as open
from urllib.error import URLError


def acessarSite(url):
    # First make sure there is connectivity at all by opening a known site.
    try:
        open('https://www.google.com.br')
    except URLError:
        print('There is no internet connection.')
        exit()
    # Now try the URL that was actually requested.
    try:
        open(url)
    except ValueError:
        print('The URL is invalid.')
    except URLError:
        print('The site is not reachable or does not exist.')
    else:
        print('The site is reachable.')
Example #2
	def _request(self, url: str, method: str = 'GET',
				 payload: bytes = None, content_type: MimeType = None,
				 accept: Sequence[MimeType] = None) -> Response:
		"""General purpose request."""
		request = Request(url=url, sni_hostname=self._sni_hostname,
						  client_cert_path=self._client_cert_path, client_key_path=self._client_key_path,
						  ca_cert_path=self._ca_cert_path,
						  data=payload, method=method)
		if payload is not None and content_type:
			request.add_header('Content-Type', str(content_type))
		if accept:
			request.add_header('Accept', ', '.join(str(mime_type) for mime_type in accept))
		if self.username and self.password:
			request.add_header('Authorization', 'Basic ' + base64.b64encode((self.username + ':' + self.password).encode('utf-8')).decode('ascii'))
		request.add_header('User-Agent', self.user_agent)

		try:
			with request.open() as response:
				return Response(response)
		except urllib.error.HTTPError as error:
			raise NetworkError(error.reason, error.code)
		except urllib.error.URLError as error:
			raise NetworkError(error.reason)
		except Exception as error:
			raise NetworkError(str(error))
Example #3
# Note: backtest(), dataframe_opts() and dataframe_data() are project-specific helpers.
import http.cookiejar
import re
import urllib.parse
import urllib.request

import pandas
from lxml.html import fragment_fromstring


def shares(year=None):
  url = backtest(year)
  cookie = http.cookiejar.CookieJar()
  request = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
  request.addheaders = [
    ('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'),
    ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01')
  ]

  # These are the search parameters for the stocks.
  # They are left blank so that all available stocks are returned.
  data = {'pl_min': '', 'pl_max': '', 'pvp_min': '', 'pvp_max' : '', 'psr_min': '', 'psr_max': '', 'divy_min': '', 'divy_max': '', 'pativos_min': '', 'pativos_max': '', 'pcapgiro_min': '', 'pcapgiro_max': '', 'pebit_min': '', 'pebit_max': '', 'fgrah_min': '', 'fgrah_max': '', 'firma_ebit_min': '', 'firma_ebit_max': '', 'margemebit_min': '', 'margemebit_max': '', 'margemliq_min': '', 'margemliq_max': '', 'liqcorr_min': '', 'liqcorr_max': '', 'roic_min': '', 'roic_max': '', 'roe_min': '', 'roe_max': '', 'liq_min': '', 'liq_max': '', 'patrim_min': '', 'patrim_max': '', 'divbruta_min': '', 'divbruta_max': '', 'tx_cresc_rec_min': '', 'tx_cresc_rec_max': '', 'setor': '', 'negociada': 'ON', 'ordem': '1', 'x': '28', 'y': '16'}

  with request.open(url, urllib.parse.urlencode(data).encode('UTF-8')) as link:
      content = link.read().decode('ISO-8859-1')

  pattern = re.compile('<table id="resultado".*</table>', re.DOTALL)
  content = re.findall(pattern, content)[0]
  page = fragment_fromstring(content)
  result = pandas.DataFrame(dataframe_opts(year))

  for rows in page.xpath('tbody')[0].findall("tr"):
      new_row = pandas.DataFrame(index=[rows.getchildren()[0][0].getchildren()[0].text],
                                 data=dataframe_data(rows, year))
      result = pandas.concat([result, new_row])
  
  return result[result['Cotação'] > 0]
Example #4
def e621_search(md5, username='******'):
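    # Look up a post on e621 by its MD5 hash; return its tags, author and source, or None on failure.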
    request = urllib.request.build_opener()
    request.addheaders = [('User-agent', 'Yiffdex/0.2a used by ' + username)]

    try:
        f = request.open("https://e621.net/post/show.json?md5=" + md5)
        if f is not None:
            data = json.loads(f.read())

            info = {}
            info['tags'] = data['tags'].split(" ")
            info['author'] = ';'.join(
                data['artist']) if 'artist' in data else ''
            info['src'] = data['source'] if 'source' in data and data[
                'source'] is not None else ''

            return info
    except urllib.error.HTTPError:
        pass
    except urllib.error.URLError:
        pass
    except json.JSONDecodeError:
        pass

    return None
Example #5
def find_link_content(link):
    page = 1
    while True:
        new_link = "http://quotes.toscrape.com" + link + "page/"
        # print(new_link)
        new_link = new_link + str(page)
        print(new_link)
        sub_bs = open(new_link)
        sub_bs = BeautifulSoup(sub_bs, 'html.parser')
        quotes = sub_bs.select('div.row div.col-md-8 span.text')
        # Stop when a page returns no quotes
        if len(quotes) == 0:
            break
        # The quotes themselves
        quotes = [quote.text.strip('“”') for quote in quotes]
        # The authors
        authors = sub_bs.select('small.author')
        authors = [author.text for author in authors]
        # The tags
        tags_list = sub_bs.select('meta.keywords')
        tags_list = [tags.get('content') for tags in tags_list]
        # print(authors)
        # print(quotes)
        #print(tags_list)
        record_list = []
        for i in range(len(quotes)):
            tags = tags_list[i]
            tags = tags.replace(',', ',')
            print(tags)
            record = [quotes[i], authors[i], tags]
            record_list.append(record)
        insert_into_mysql(record_list)
        page += 1
Example #6
def goto1():

    url = "http://ip-api.com/json/"
    response = open(url + ip)
    data = response.read()
    values = json.loads(data)
    status = values['status']
    success = "success"
    lat = str(values['lat'])
    lon = str(values['lon'])
    a = lat + ","
    b = lon + "/"
    c = b + "data=!3m1!1e3?hl=en"
    location = a + c

    maps = "https://www.google.com/maps/search/"
    webbrowser.open(maps + location)

    print(" IP: " + values['query'])
    print(" Status: " + values['status'])
    print(" city: " + values['city'])
    print(" ISP: " + values['isp'])
    print(" latitude: " + lat)
    print(" longitude: " + lon)
    print(" country: " + values['country'])
    print(" region: " + values['regionName'])
    print(" city: " + values['city'])
    print(" zip: " + values['zip'])
    print(" AS: " + values['as'])
    if status == success:
        speak("sucessfully located")
    else:
        speak("cannot find the location,sir")

    goto2()
Example #7
def find_top_ten(url):
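    # Collect the link targets (hrefs) of the top-ten tag items on the page and return them.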
    response = open(url)
    bs = BeautifulSoup(response, 'html.parser')
    tags = bs.select('span.tag-item a')
    top_ten_href = [tag.get('href') for tag in tags]
    top_ten_tag = [tag.text for tag in tags]
    #print(top_ten_href)
    #print(top_ten_tag)
    return top_ten_href
Example #8
def test_scrape(): 
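    # Extract the page <title> from python.org by plain string searching.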
    url = 'http://python.org'
    html = open(url).read()
    html_decoded = html.decode("utf-8")
    title_index = html_decoded.find("<title>")
    start_index = title_index + len("<title>")
    end_index = html_decoded.find("</title>")
    title = html_decoded[start_index: end_index]
    return title 
Example #9
def nameandcolor_scrape():
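    # Print the text that follows "Name:" and "Favorite Color: " in the profile page, up to the next tag.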
    url = "http://olympus.realpython.org/profiles/dionysus"
    html = open(url).read().decode("utf-8")
    patterns = ["Name:" ,"Favorite Color: "]
    for string in patterns: 
        start = html.find(string) + len(string)
        tag = start + html[start:].find("<") 
        texts  = html[start : tag]
        print (texts)
Example #10
def scrape_work():
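    # Extract the page <title> from python.org with a regular expression, then strip the tags.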
    url = 'http://python.org'
    html = open(url).read()
    html_decoded = html.decode("utf-8")
    pattern = ("<title.*?>.*?</title.*?>")
    match_results = re.search(pattern, html_decoded, re.IGNORECASE)
    title = match_results.group()
    title = re.sub("<.*?>", "", title) # Removes HTML TAGS FROM TITLE 
    return title 
Example #11
    def logout(self):
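        # Ask the API to end the current session; network errors are silently ignored.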
        request = urllib.request.build_opener()
        request.addheaders = [('User-agent', 'Yiffdex/0.2a')]

        try:
            f = request.open(self.api_url + 'api_logout.php?sid=' + self.sid)
        except urllib.error.HTTPError:
            pass
        except urllib.error.URLError:
            pass
Example #12
    def login(self, username='******', password=''):
        request = urllib.request.build_opener()
        request.addheaders = [('User-agent', 'Yiffdex/0.2a'),
                              ('Content-type', 'multipart/form-data')]

        # Login and get sid
        try:
            params = urllib.parse.urlencode({
                'username': username,
                'password': password
            }).encode('ascii')
            f = request.open(self.api_url + 'api_login.php', params)
            if f is not None:
                data = json.loads(f.read())
                self.sid = data['sid'] if 'sid' in data else ''
        except urllib.error.HTTPError:
            pass
        except urllib.error.URLError:
            pass
        except json.JSONDecodeError:
            pass

        if self.sid == '':
            print('Unable to log in on Inkbunny!')
            return False

        # Modify rating option
        try:
            params = urllib.parse.urlencode({
                'sid': self.sid,
                'tag[2]': 'yes',
                'tag[3]': 'yes',
                'tag[4]': 'yes',
                'tag[5]': 'yes'
            }).encode('ascii')
            f = request.open(self.api_url + 'api_userrating.php', params)
        except urllib.error.HTTPError:
            pass
        except urllib.error.URLError:
            pass

        return True
Example #13
    def get_files_links(self, urls):
        files = {}
        config = Config.get_config()
        debug = config.debug
        for url in urls:
            request = self.request.get_opener()
            with request.open(url) as f:
                data = f.read().decode("utf-8")
                files[url] = re.findall(r'href=[\'"]?([^\'" >]+)', data, re.UNICODE | re.MULTILINE)
                files[url] = files[url][1:]
                if debug:
                    formatter.debug_message("Found files: {0} for urls {1}".format(files, urls))
        return files
Example #14
    def __init__(self, request):
        """Set up a response object."""
        self.error = False
        try:
            response = request.open()
            self.data = response.read()
            self.status_code = response.getcode()
            self.mimetype = response.info().get_content_type()

        except urllib.error.HTTPError as error:
            self.status_code = error.getcode()
            self.mimetype = error.info().get_content_type()
            self.data = error.read()
            self.error = True
Example #15
    def __request(self, url):
        try:
            request = urllib.request.build_opener()
            # The opener's request headers live in the "addheaders" list.
            request.addheaders = [
                ('User-agent',
                 'matheushssrobot/site:matheushsoaress.wordpress.com')
            ]
            return request.open(url)
        except urllib.error.ContentTooShortError as e:
            print("A urllib error occurred: " + repr(e))
        except urllib.error.HTTPError as e:
            print("A urllib error occurred: " + repr(e))
        except urllib.error.URLError as e:
            print("A urllib error occurred: " + repr(e))
        except ValueError as e:
            print(repr(e))
        except Exception as e:
            print(repr(e))
            print("An unknown error occurred in the Robot.__request method")
Example #16
from urllib.request import urlopen as open
from bs4 import BeautifulSoup as bs
soup = bs(open('https://www.naver.com/'))


def find(Obj1, Obj2):
    return soup.find(Obj1, Obj2)


find("div", {"class":"nanan"})
    "https://www.deltakits.com/shop/windshield-repair-products/",
    "https://www.deltakits.com/shop/windshield-repair-products/page/2/",
    "https://www.deltakits.com/shop/windshield-repair-products/page/3/",
    "https://www.deltakits.com/shop/windshield-repair-products/page/4/",
    "https://www.deltakits.com/shop/windshield-repair-products/page/5/",
    "https://www.deltakits.com/shop/windshield-repair-products/page/6/",
    "https://www.deltakits.com/shop/windshield-repair-products/page/7/",
    "https://www.deltakits.com/shop/windshield-repair-products/page/8/",
    "https://www.deltakits.com/shop/windshield-repair-products/page/9/",
    "https://www.deltakits.com/shop/windshield-repair-products/page/10/",
    "https://www.deltakits.com/shop/windshield-repair-products/page/11/",
    "https://www.deltakits.com/shop/windshield-repair-products/page/15/"
]

for url in urls:
    download = open(url)
    page_html = download.read()
    download.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("", {"class": "product-inner"})

    for container in containers:
        kit_model = container.findAll("div", {"class": "cat-list"})
        kit = kit_model[0].text.strip()

        kit_price = container.findAll("span",
                                      {"class": "woocommerce-Price-amount"})
        price = kit_price[0].text.strip()

        kit_rating = container.findAll("div", {"class": "star-rating"})
        rating = kit_rating[0].text.strip()
Example #18
					req = request.Request('http://javaweb.io/login')
					requestBody = parse.urlencode({
						'name': 'root',
						'pass': '******',
						'verifyCode': verifyCode,
					})
					# Perform the login using this cookie
					with opener.open(req, data=bytes(requestBody, 'utf_8')) as response:
						# Login OK; the cookie now carries the login credential, so the home page can be opened
						with opener.open('http://javaweb.io') as response:
							print(response.read().decode())
		
----------------------------
urllib-HTTPResponse			|
----------------------------
	* The HTTP response object, i.e. the object returned by request.open()
	* Instance attributes
		status
			* HTTP status code
		reason
			* Reason phrase

	* Methods
		bytes read()
			* Read the response body

		bytes readline()
			* Read one line of data

		int getcode()
			* Return the HTTP status code
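
A minimal usage sketch of the attributes and methods listed above, calling urllib.request.urlopen directly (the URL is only an illustration):

import urllib.request

# Open a page and inspect the HTTPResponse object described above.
with urllib.request.urlopen('http://javaweb.io') as response:
    print(response.status)     # HTTP status code, e.g. 200
    print(response.reason)     # reason phrase, e.g. 'OK'
    print(response.getcode())  # the status code again, via getcode()
    body = response.read()     # the response body as bytes
    print(body[:80])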
Example #19
# Using internal and external libraries via import
"""
import urllib.request
import bs4
url = "https://www.naver.com/"
html = urllib.request.urlopen(url)
bs4.BeautifulSoup(html)
bsObj = bs4.BeautifulSoup(html)
"""

# Shortening the code
from urllib.request import urlopen as open
from bs4 import BeautifulSoup as bs
url = "https://www.naver.com/"
bsObj = bs(open(url))


# Using a function
def find(para, para2):
    return bsObj.find(para, para2)


'''
# 1. Crawl the whole page
# print(html.read())  read everything
# print(bsObj)  read everything using bs4

# 2. Crawl part of the page (the top menu area)
# top_Obj = bsObj.find("div", {"class": "service_area"})
# print(top_Obj)
'''
Example #21
def beautifulcrape():
    url = open("http://python.org").read().decode("utf-8")
    bowl = BS(url, "html.parser")
    return bowl
Example #22
#!/usr/bin/python3
# -*- coding: utf-8 -*-

from urllib.request import urlopen as open

import re

url = "http://quotes.toscrape.com/"

response = open(url)

html = response.read()
html = html.decode("UTF-8")

# Get the 10 quotes
result = re.findall('<span class="text" itemprop="text">(.*)</span>', html)
print(result)
print(len(result))
true_quotes = []
for single_result in result:
    # strip() scans from both ends and removes any character that is in the given set
    new_result = single_result.strip("“”.")
    true_quotes.append(new_result)
    # print(new_result)

# Get the authors of the 10 quotes
true_authors = []
authors = re.findall('<small class="author" itemprop="author">(.*)</small>',
                     html)
print(len(authors))
for author in authors:
    true_authors.append(author)
Example #23
                    dest='n',
                    default=2000,
                    type=str,
                    help='Number of repositories per request')
args = parser.parse_args()

request = urllib.request.URLopener()
request.addheader(
    'Authorization',
    args.auth_header,
)

last = ''
repositories = []

while True:
    params = urllib.parse.urlencode({'n': 2000, 'last': last})

    responce = request.open(args.catalog_url + '?' + params, ).read()

    responce_dict = json.loads(str(responce, 'utf-8'))

    if len(responce_dict['repositories']):
        last = responce_dict['repositories'][-1]
        repositories += responce_dict['repositories']
        print('Last repo is: ' + last)
    else:
        break

print(repositories)
Example #24
def mechanicalscrape():
    # MS is assumed to be the mechanicalsoup module; Browser.get() takes a URL, not raw HTML.
    url = "http://python.org"
    bowl = MS.Browser()
    spoon = bowl.get(url)
    return spoon
Example #25
from urllib.request import urlopen as open

# Customizable tags to search for
url = input("what website would you like to scrape from? ")
decode = input("What decoding method would you like to use? ")
start = input("what starting position would you like to search for? ")
end = input("what ending position would you like to search for? ")

# Opening the website
html = open(url)
readfile = html.read().decode(decode)

# Gathering the indexes
start_index = readfile.find(start)
end_index = readfile.find(end)
startpos = start_index + len(start)
endpos = end_index + len(end)

# Extracting the information
isScraping = readfile[startpos:endpos]

print("Here is the information you requested!")
print("--------------------------------------")
print(isScraping)
Example #26
from urllib.request import urlopen as open
from tkinter import Tk
import win32com.client as win32

u = open("http://n.sinaimg.cn/news/1_img/dfic/34fa2aa3/62/w1024h638/20190510/4350-hwsffzc2527481.jpg")
x1= win32.gencache.EnsureDispatch('Excel.Application')
ss = x1.Workbooks.Add()
sh = ss.ActiveSheet
x1.Visible = True
sh.Cells(5,10).Value = u
Example #27
import re
import urllib.request as l

city = input("Enter city name: ")

url = "http://www.weather-forecast.com/locations/"+ city + "/forecasts/latest"

data = l.urlopen(url).read()

data1 = data.decode("utf-8")
Example #28
    def search(self, md5):
        request = urllib.request.build_opener()
        request.addheaders = [('User-agent', 'Yiffdex/0.2a'),
                              ('Content-type', 'multipart/form-data')]

        data = None

        # Search file
        try:
            params = urllib.parse.urlencode({
                'sid': self.sid,
                'text': md5,
                'md5': 'yes',
                'submission_ids_only': 'yes',
                'keywords': 'no'
            }).encode('ascii')
            f = request.open(self.api_url + 'api_search.php', params)
            if f is not None:
                data = json.loads(f.read())
        except urllib.error.HTTPError:
            pass
        except urllib.error.URLError:
            pass
        except json.JSONDecodeError:
            pass

        # Not found
        if data is None or int(data['results_count_all']) == 0:
            return None

        # Get submission information
        try:
            params = urllib.parse.urlencode({
                'sid':
                self.sid,
                'submission_ids':
                data['submissions'][0]['submission_id']
            }).encode('ascii')
            f = request.open(self.api_url + 'api_submissions.php', params)
            if f is not None:
                data = json.loads(f.read())
        except urllib.error.HTTPError:
            pass
        except urllib.error.URLError:
            pass
        except json.JSONDecodeError:
            pass

        # Not found (defensive check; should not normally happen)
        if data['results_count'] == 0:
            return None

        info = {}
        info['tags'] = [
            k['keyword_name'] for k in data['submissions'][0]['keywords']
        ]
        info['author'] = data['submissions'][0]['username']
        info['src'] = 'https://inkbunny.net/s/' + data['submissions'][0][
            'submission_id']

        return info
Example #29
from urllib.request import urlopen as open
import json

with open('https://eric.clst.org/assets/wiki/uploads/Stuff/gz_2010_us_040_00_5m.json') as response:
    states = json.load(response)

import pandas as pd
df = pd.read_csv("covid19_vaccinations_in_the_united_states.csv", dtype={"State": str}, skiprows=3)
df2 = pd.read_csv("united_states_covid19_cases_and_deaths_by_state.csv", skiprows=3)
df.rename(columns={"State/Territory/Federal Entity":"State"}, inplace=True)
df2.rename(columns={"State/Territory/Federal Entity":"State"}, inplace=True)
df.at[42, "State"] = "New York"
df.drop(df.index[5], inplace=True)
df.drop(df.index[9], inplace=True)
df.drop(df.index[17], inplace=True)
df.drop(df.index[25], inplace=True)
df.drop(df.index[52], inplace=True)
df2.loc[38, "Total Cases"] += df2.loc[39, "Total Cases"]
df2.drop(df2.index[39], inplace=True)
df2.drop(df2.index[44], inplace=True)
df2.drop(df2.index[58], inplace=True)
df["Total Cases"] = [total_case for total_case in df2["Total Cases"]]
df["# Vaccinated/Total Cases"] = [vaccinated/total_case for total_case, vaccinated in zip(df["Total Cases"], df["Total Administered"])]

import plotly.express as px

fig = px.choropleth(df, geojson=states, locations="State", color="# Vaccinated/Total Cases", featureidkey="properties.NAME",
                           color_continuous_scale="Bugn",
                           range_color=(0, 1),
                           scope="usa",
                           labels={"Total Administered" : "Vaccines Administered"},