def get_currencies_rates(url: str) -> dict:
    """Return a dictionary with exchange rates scraped from `url`."""
    result: Dict[str, dict] = {}
    response = None
    try:
        response = get(url)
    except (HTTPError, URLError) as err:
        logging.exception(err)
    if response:
        soup = Soup(response)
        currencies = [
            cur.find("a") for cur in soup.find("div", {"class": NAME})
        ]
        buy = soup.find("div", {"class": BUY})
        buy_values = [value.find("div", {"class": NUM}).text for value in buy]
        sale = soup.find("div", {"class": SALE})
        sale_values = [value.find("div", {"class": NUM}).text for value in sale]
        for cur, buy_num, sale_num in zip(currencies, buy_values, sale_values):
            cur = cur.text.split()[-1]
            result[cur] = {"buy": float(buy_num), "sale": float(sale_num)}
    return result
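# Assumed module-level setup (not shown in the original snippet): the imports
# the function above relies on, plus placeholder values for the CSS-class
# constants NAME, BUY, SALE, and NUM.
import logging
from typing import Dict
from urllib.error import HTTPError, URLError
from gazpacho import get, Soup

NAME, BUY, SALE, NUM = "name", "buy", "sale", "num"  # placeholder class names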
def scrape():
    url = request.args.get('url')
    html = get(url)
    soup = Soup(html)

    def meta_content(attr, value):
        """Return the `content` of a matching <meta> tag, or None."""
        tag = soup.find('meta', attrs={attr: value}, mode='first')
        return tag.attrs['content'] if tag is not None else None

    name = meta_content('property', "og:title")
    if name is None:
        name = soup.find('title', mode='first').text

    description = meta_content('property', "og:description")
    if description is None:
        description = meta_content('name', "description")

    image = meta_content('property', "og:image")
    if image is None:
        image = meta_content('name', "image")

    price = meta_content('property', "og:price:amount")
    if price is None:
        price = meta_content('name', "price")

    vendor = meta_content('property', "og:site_name")
    if vendor is None:
        vendor = meta_content('name', "site_name")

    return {
        "name": name if name is not None else '',
        "description": description if description is not None else '',
        "image": image if image is not None else '',
        "price": price if price is not None else '',
        "vendor": vendor if vendor is not None else '',
        "dest_url": url
    }, 200
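# Hypothetical wiring for the route above (not in the original source): it
# assumes `scrape` is a Flask view and that `request`, `get`, and `Soup` come
# from the imports below.
from flask import Flask, request
from gazpacho import get, Soup

app = Flask(__name__)
app.add_url_rule("/scrape", view_func=scrape)  # e.g. GET /scrape?url=https://example.com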
def parse_book(book: Soup):
    title = book.find('h4').text
    price = book.find('p').text
    print("____________________________________________")
    # Uncomment to see each book's Soup object
    # print(book)
    print("Title : " + title)
    print("Price : " + price)
def _parse_quote(quote_text: Soup) -> Dict[str, str]:
    b = quote_text.find("a", {"class": "authorOrTitle"}, mode="first")
    a = quote_text.find("span", {"class": "authorOrTitle"}, mode="first")
    q = re.search("(?<=“)(.*?)(?=”)", quote_text.strip())
    return {
        "author": "" if not isinstance(a, Soup) else a.text.replace(",", ""),
        "book": "" if not isinstance(b, Soup) else b.text,
        "quote": "" if not q else q.group(0),
    }
def download(player_id):
    url = f'https://www.hockey-reference.com/players/{player_id[0]}/{player_id}/gamelog/2020'
    html = get(url)
    soup = Soup(html)
    table = soup.find('table', {'id': "gamelog"})
    df = pd.read_html(str(table))[0]
    df.columns = ['_'.join(col) for col in df.columns]
    df['name'] = soup.find('h1').text
    df['player_id'] = player_id
    meta = soup.find('div', {'id': 'meta'}).find('p', mode='first').remove_tags()
    df['position'] = meta.split(': ')[1].split(' •')[0]
    return df
def download_player(player_id):
    url = f"https://www.hockey-reference.com/players/{player_id[0]}/{player_id}/gamelog/2020"
    html = get(url)
    soup = Soup(html)
    table = soup.find("table", {"id": "gamelog"})
    df = pd.read_html(str(table))[0]
    df.columns = ["_".join(col) for col in df.columns]
    df["name"] = soup.find("h1").text
    df["player_id"] = player_id
    meta = soup.find("div", {"id": "meta"}).find("p", mode="first").remove_tags()
    df["position"] = meta.split(": ")[1].split(" •")[0]
    return df
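# Hypothetical usage (not in the original source): "mcdavco01" stands in for a
# real hockey-reference.com player id; any valid id should work the same way.
gamelog = download("mcdavco01")
print(gamelog[["name", "position"]].head())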
def parse(file):
    with open(file, "r") as f:
        html = f.read()
    id = get_id(file)
    soup = Soup(html)
    attributes = soup.find("a", {"href": "/cryptopunks/search?query"}, mode="list")
    attributes = [a.text for a in attributes]
    trs = soup.find("tr")[1:]
    transactions = [tr_to_dict(tr) for tr in trs]
    owner = get_owner(soup)
    return dict(id=id, owner=owner, attributes=attributes, transactions=transactions)
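# Hypothetical driver (not in the original source): parses every saved punk
# page in a local `punks/` directory; assumes `get_id`, `tr_to_dict`, and
# `get_owner` are the helpers the function above relies on.
from pathlib import Path

punks = [parse(str(path)) for path in Path("punks").glob("*.html")]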
def parser_selenium(page):
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    try:
        browser = Chrome(options=chrome_options)
    except Exception:
        return 'NAME', 'Chrome components are missing, so the account type cannot be determined.'
    browser.get(page)
    browser.find_element_by_xpath(
        '/html/body/div[2]/div/div/div/div[2]/button[1]').click()
    try:
        tipo = browser.find_element_by_xpath(
            '//*[@id="react-root"]/section/main/div/div/article/div/div/h2')
        tipo = tipo.text
    except Exception:
        tipo = 'This account is public'
    page = browser.page_source
    html_g = Soup(page)
    try:
        nombre = html_g.find('h1', attrs={'class': 'rhpdm'})
        nombre = nombre.text
    except Exception:
        nombre = 'user without a name'
    return nombre, tipo
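# Portability sketch (assumption, not from the original source): the
# find_element_by_* helpers used above were removed in Selenium 4; this is the
# equivalent lookup on current versions.
from selenium.webdriver.common.by import By

def click_accept_button(browser):
    # same XPath as above, expressed with the Selenium 4 API
    browser.find_element(By.XPATH, '/html/body/div[2]/div/div/div/div[2]/button[1]').click()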
def get_article_for_packerswire(url: str) -> Optional[str]:
    try:
        soup = Soup(get(url))
        lines = soup.find('div', {'class': 'articleBody'}).find('p')
        return _arranged(lines)
    except Exception:
        return ''
def get_index_components(query: str = 'https://stooq.pl/q/i/?s=mwig40'):
    '''
    Return the components of the index specified by `query` (for the Polish
    Stock Exchange (GPW) that would be, for example, mWIG40).

    For the time being the default `query` downloads mWIG40 data; pointing it
    at some other index (WIG20, for example) should be straightforward.
    '''
    html_contents = []
    companies = []
    html = get(query)
    soup = Soup(html)
    data_table = soup.find('tbody')
    for i in data_table:
        if i.find('font') is None:
            continue
        html_contents.append(i.find('font'))
    for element in html_contents[0]:
        if element.find('a') is None:
            continue
        companies.append(re.findall(">(.*)</a>", str(element.find('a')))[0])
    return companies
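# Hypothetical usage (not in the original source): prints the mWIG40
# constituents using the default query.
for company in get_index_components():
    print(company)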
def comprobar_estado_sl(html, user, sl):
    html_s = Soup(html)
    a = html_s.find('meta', attrs={'property': 'og:description'})
    dic = a.attrs
    n = dic['content'].replace('-', ',')
    # fix the problem of ',' used as a thousands separator inside numbers
    new_n = n[0]
    for i in range(1, len(n) - 1):
        if n[i - 1].isdigit() and n[i] == ',' and n[i + 1].isdigit():
            new_n += '.'
        else:
            new_n += n[i]
    if len(n) > 1:
        new_n += n[-1]
    n = new_n
    n = n.split(',')
    data = (x.strip() for x in n)
    # e.g. ('431 Followers', '872 Following', '294 Posts', 'See Instagram photos and videos from JP (@juanpedro)')
    seguidores, seguidos, publicaciones, usuario = data
    *_, usuario = usuario.split(' ')  # '@juanpedro'
    if '(' in usuario:
        usuario = usuario[1:-1]
    if sl:
        url_user = url + usuario + '/?hl=es'
        nombre, tipo = parser_selenium(url_user)
    else:
        nombre, tipo = 'name?', 'Selenium is not installed, so the account type cannot be determined...'
    return usuario, nombre, seguidores, seguidos, publicaciones, tipo
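# Assumed module-level constant (not shown in the original snippet): the
# Instagram base URL that `comprobar_estado_sl` prepends to the username.
url = 'https://www.instagram.com/'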
def scrape_position_cbs(position):
    url = f'https://www.cbssports.com/fantasy/hockey/stats/{position}/2019/season/projections/'
    html = get(url)
    soup = Soup(html)
    table = soup.find('table', {'class': 'TableBase-table'})
    pdf = pd.read_html(str(table))[0]
    return pdf
def get_game_stats(url):
    url += "/stats"
    html = get(url)
    soup = Soup(html)
    rows = soup.find("div", {"class": "BoxScore__statLine"})
    data = [parse_stat_row(row) for row in rows]
    return data
def get_article_for_dev_to(url: str) -> Optional[str]:
    try:
        soup = Soup(get(url))
        lines = soup.find('div', {'id': 'article-body'}).find('p')
        return _arranged(lines)
    except Exception:
        return ''
async def explosm(self, ctx):
    async with ctx.channel.typing():
        comic = "https://explosm.net/rcg"
        async with aiohttp.ClientSession() as session:
            async with session.get(comic) as resp:
                html = await resp.text()
        comic_soup = Soup(html)
        images = comic_soup.find("div", {"class": "rcg-panels"})
        urls = [img.attrs["src"] for img in images.find("img")]
        filenames = []
        for url in urls:
            filename = f"{ctx.author.id}-{time.time()}.png"
            filenames.append(filename)
            await self.get_image(url, filename)
        final_filename = f"{ctx.author.id}-{time.time()}.png"
        self.combine_images(*filenames, final_filename)
        for filename in filenames:
            os.remove(filename)
        await ctx.reply(file=discord.File(final_filename))
        os.remove(final_filename)
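# Hypothetical helper (not shown in the original snippet): `get_image` is
# assumed to download one comic panel to disk; a minimal aiohttp sketch:
async def get_image(self, url, filename):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            with open(filename, "wb") as f:
                f.write(await resp.read())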
def get_article(url: str) -> Optional[str]:
    try:
        soup = Soup(get(url))
        lines = soup.find('p')
        return _arranged(lines)
    except Exception:
        return ''
def _get_page_quotes(soup: Soup) -> List[Dict[str, str]]:
    quotes = []
    quote_texts = soup.find("div", {"class": "quoteText"}, mode="all")
    assert isinstance(quote_texts, list)
    for quote_text in quote_texts:
        quote = _parse_quote(quote_text)
        quotes.append(quote)
    return quotes
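# Hypothetical driver (not in the original source): fetches one Goodreads
# quotes page and runs the two helpers above; the URL is a placeholder.
quotes_html = get("https://www.goodreads.com/quotes?page=1")
for quote in _get_page_quotes(Soup(quotes_html)):
    print(quote["author"], "-", quote["quote"][:60])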
def scrape():
    url = "https://www.amazon.ca/Witcher-Nintendo-Switch-Games-Software/dp/B07T4D63YT/"
    browser.get(url)
    html = browser.page_source
    soup = Soup(html)
    price = soup.find("span", {"id": "price"}, partial=True, mode='first').text
    price = float(price.replace("CDN$\xa0", ""))
    return f"The Witcher 3 is ${price} on Amazon.ca right now"
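# Assumed module-level setup (not shown in the original snippet): `browser` is
# a Selenium driver created once at import time, e.g. a headless Chrome.
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")
browser = Chrome(options=options)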
def fetch_sale():
    url = "https://scrape.world/books"
    html = get(url)
    soup = Soup(html)
    # gazpacho matches classes partially by default, so "book-" catches book-1, book-2, ...
    books_raw = soup.find("div", {"class": "book-"})
    books = [parse(book) for book in books_raw]
    on_sale = [name for name, price in books if price == 0.99]
    return "\n".join(on_sale)
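# Hypothetical helper (not shown in the original snippet): `fetch_sale` expects
# `parse` to return a (name, price) tuple for each book div; a minimal sketch:
def parse(book):
    name = book.find("h4").text
    price = float(book.find("p").text[1:].split(" ")[0])  # strip the "$" prefix
    return name, price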
def get_article_for_packers(url: str) -> Optional[str]:
    try:
        soup = Soup(get(url))
        lines = soup.find('div', {'class': 'nfl-c-body-part nfl-c-body-part--text'})
        return _arranged(lines)
    except Exception:
        return ''
def scrape_position_numberfire(position):
    url = f'https://www.numberfire.com/nhl/fantasy/yearly-projections/{position}'
    html = get(url)
    soup = Soup(html)
    tables = soup.find('table', {'class': 'projection-table'})
    names = pd.read_html(str(tables[0]))[0]
    data = pd.read_html(str(tables[1]))[0]
    df = pd.concat([names, data], axis=1)
    return df
def scrape():
    from gazpacho import get, Soup
    url = 'https://front.njpwworld.com/search/latest?page=465'
    soup = Soup(get(url))
    movie_areas = soup.find('div', {'class': 'movieArea'})
    links = [m.find('a') for m in movie_areas]
    for link in links:
        movie_id = link[0].attrs['href'].replace('/p/', '')
        url = f'{endpoint_movie}{movie_id}'
        res = requests.post(url)
def get_boxscore_urls(date):
    if isinstance(date, pd.Timestamp):
        date = date.strftime("%Y-%m-%d")
    url = f"{base}/nba/events/date/{date}"
    html = get(url)
    soup = Soup(html)
    games = soup.find("div", {'class': "Layout__content"}).find('a', mode='all')
    urls = [base + game.attrs['href'] for game in games]
    return urls
def get_game_stats(url):
    url += "/stats"
    html = get(url)
    soup = Soup(html)
    rows = soup.find("div", {"class": "BoxScore__statLine"})
    data = []
    for row in rows:
        try:
            data.append(parse_stat_row(row))
        except Exception:
            pass
    return data
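# Hypothetical pipeline (not in the original source): pulls every box score for
# one date; assumes `base` is the same site root used by `get_boxscore_urls`.
for boxscore_url in get_boxscore_urls("2020-01-15"):
    stats = get_game_stats(boxscore_url)
    print(boxscore_url, len(stats), "stat lines")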
def scrape_daily_faceoff():
    URL = 'https://www.dailyfaceoff.com/fantasy-hockey-projections/'
    html = get(URL)
    soup = Soup(html)
    # DataFrame.append was removed in pandas 2.0; collect frames and concat
    frames = []
    for table_id in ['igsv', 'igsv-1']:
        table = soup.find(
            'table', {'id': f'{table_id}-1N8XNZpOIb8-6WcOPANqSHRyHBXlwZ6X_1vgGyDbETm4'})
        frames.append(pd.read_html(str(table))[0])
    df = pd.concat(frames).reset_index(drop=True)
    return df
def yahoo_draft_rankings():
    URL = 'https://www.fantasypros.com/nhl/adp/overall.php'
    html = get(URL)
    soup = Soup(html)
    df = pd.read_html(str(soup.find('table')))[0]
    df[['first', 'last', 'team']] = df['Player Team'].str.split(' ', n=2, expand=True)
    df['name'] = df['first'] + ' ' + df['last']
    df.columns = [c.lower() for c in df.columns]
    df = df[['name', 'yahoo']]
    return df
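# Hypothetical usage (not in the original source): joins Yahoo ADP onto the
# Daily Faceoff projections by player name; the 'Player' column is an
# assumption about the projections table layout.
projections = scrape_daily_faceoff()
adp = yahoo_draft_rankings()
merged = projections.merge(adp, left_on='Player', right_on='name', how='left')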
async def wikihow(self, ctx):
    async with ctx.channel.typing():
        wikihow_article = "https://wikihow.com/Special:Randomizer"
        async with aiohttp.ClientSession() as session:
            async with session.get(wikihow_article) as resp:
                html = await resp.text()
        wikihow_soup = Soup(html)
        images = wikihow_soup.find("li", {"id": "step-id"})
        await ctx.reply(random.choice(images).find("img")[1].attrs["src"])
def from_file(cls, html_file: str):
    """
    Load the player html from a previously saved file
    """
    with open(html_file, "r") as f:
        # get player id from the canonical link
        soup = Soup(f.read())
        href = soup.find("link", attrs={"rel": "canonical"}).attrs["href"]
        id = int(href.split("/")[6].split(".")[0])
    return cls(id=id, html_file=html_file)
async def google_play_link(event: Union[NewMessage.Event, Message]):
    try:
        html = get(event.pattern_match.group(1))
        soup = Soup(html)
        title = soup.find('div', {'class': 'title fade-out'})
        artist = soup.find('div', {'class': 'album-artist fade-out'})
        await event.respond(
            "Do you want to search for this song? Tap the button below",
            buttons=[[
                Button.switch_inline(translate.SEARCH_TRACK,
                                     query='%s - %s' % (title.text, artist.text),
                                     same_peer=True)
            ]])
        raise events.StopPropagation
    except HTTPError:
        pass
def from_files(cls, html_file: str, json_file: str):
    """
    Create series object from offline files
    """
    with open(html_file, "r") as f:
        # get series_id from the canonical link
        soup = Soup(f.read())
        href = soup.find("link", attrs={"rel": "canonical"}).attrs["href"]
        id = int(href.split("/")[6])
    return cls(id=id, html_file=html_file, json_file=json_file)
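# Hypothetical usage (not in the original source): `Player` and `Series` stand
# in for whatever classes the two classmethods above belong to, and the file
# names are placeholders for previously saved pages.
player = Player.from_file("player_8478402.html")
series = Series.from_files("series_2019030411.html", "series_2019030411.json")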