def get_currencies_rates(url: str) -> dict:
    """Return a dictionary with exchange rates scraped from `url`."""
    result: Dict[str, dict] = {}
    response = None
    try:
        response = get(url)
    except (HTTPError, URLError) as err:
        logging.exception(err)
    if response:
        soup = Soup(response)
        currencies = [
            cur.find("a") for cur in soup.find("div", {"class": NAME})
        ]
        buy = soup.find("div", {"class": BUY})
        buy_values = [value.find("div", {"class": NUM}).text for value in buy]
        sale = soup.find("div", {"class": SALE})
        sale_values = [value.find("div", {"class": NUM}).text for value in sale]
        for cur, buy_num, sale_num in zip(currencies, buy_values, sale_values):
            cur = cur.text.split()[-1]
            result[cur] = {"buy": float(buy_num), "sale": float(sale_num)}
    return result
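# Assumed module-level setup (not shown in the original snippet): the imports
# the function above relies on, plus placeholder values for the CSS-class
# constants NAME, BUY, SALE, and NUM.
import logging
from typing import Dict
from urllib.error import HTTPError, URLError
from gazpacho import get, Soup

NAME, BUY, SALE, NUM = "name", "buy", "sale", "num"  # placeholder class names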
def scrape():
    url = request.args.get('url')
    html = get(url)
    soup = Soup(html)

    def meta_content(attr, value):
        """Return the `content` of a matching <meta> tag, or None."""
        tag = soup.find('meta', attrs={attr: value}, mode='first')
        return tag.attrs['content'] if tag is not None else None

    name = meta_content('property', "og:title")
    if name is None:
        name = soup.find('title', mode='first').text

    description = meta_content('property', "og:description")
    if description is None:
        description = meta_content('name', "description")

    image = meta_content('property', "og:image")
    if image is None:
        image = meta_content('name', "image")

    price = meta_content('property', "og:price:amount")
    if price is None:
        price = meta_content('name', "price")

    vendor = meta_content('property', "og:site_name")
    if vendor is None:
        vendor = meta_content('name', "site_name")

    return {
        "name": name if name is not None else '',
        "description": description if description is not None else '',
        "image": image if image is not None else '',
        "price": price if price is not None else '',
        "vendor": vendor if vendor is not None else '',
        "dest_url": url
    }, 200
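# Hypothetical wiring for the route above (not in the original source): it
# assumes `scrape` is a Flask view and that `request`, `get`, and `Soup` come
# from the imports below.
from flask import Flask, request
from gazpacho import get, Soup

app = Flask(__name__)
app.add_url_rule("/scrape", view_func=scrape)  # e.g. GET /scrape?url=https://example.com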
def parse_book(book: Soup):
    title = book.find('h4').text
    price = book.find('p').text
    print("____________________________________________")
    # Uncomment to see each book's Soup object
    # print(book)
    print("Title : " + title)
    print("Price : " + price)
def _parse_quote(quote_text: Soup) -> Dict[str, str]:
    b = quote_text.find("a", {"class": "authorOrTitle"}, mode="first")
    a = quote_text.find("span", {"class": "authorOrTitle"}, mode="first")
    q = re.search("(?<=“)(.*?)(?=”)", quote_text.strip())
    return {
        "author": "" if not isinstance(a, Soup) else a.text.replace(",", ""),
        "book": "" if not isinstance(b, Soup) else b.text,
        "quote": "" if not q else q.group(0),
    }
def download(player_id):
    url = f'https://www.hockey-reference.com/players/{player_id[0]}/{player_id}/gamelog/2020'
    html = get(url)
    soup = Soup(html)
    table = soup.find('table', {'id': "gamelog"})
    df = pd.read_html(str(table))[0]
    df.columns = ['_'.join(col) for col in df.columns]
    df['name'] = soup.find('h1').text
    df['player_id'] = player_id
    meta = soup.find('div', {'id': 'meta'}).find('p', mode='first').remove_tags()
    df['position'] = meta.split(': ')[1].split(' •')[0]
    return df
def download_player(player_id):
    url = f"https://www.hockey-reference.com/players/{player_id[0]}/{player_id}/gamelog/2020"
    html = get(url)
    soup = Soup(html)
    table = soup.find("table", {"id": "gamelog"})
    df = pd.read_html(str(table))[0]
    df.columns = ["_".join(col) for col in df.columns]
    df["name"] = soup.find("h1").text
    df["player_id"] = player_id
    meta = soup.find("div", {"id": "meta"}).find("p", mode="first").remove_tags()
    df["position"] = meta.split(": ")[1].split(" •")[0]
    return df
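# Hypothetical usage (not in the original source): "mcdavco01" stands in for a
# real hockey-reference.com player id; any valid id should work the same way.
gamelog = download("mcdavco01")
print(gamelog[["name", "position"]].head())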
def parse(file):
    with open(file, "r") as f:
        html = f.read()
    id = get_id(file)
    soup = Soup(html)
    attributes = soup.find("a", {"href": "/cryptopunks/search?query"}, mode="list")
    attributes = [a.text for a in attributes]
    trs = soup.find("tr")[1:]
    transactions = [tr_to_dict(tr) for tr in trs]
    owner = get_owner(soup)
    return dict(id=id, owner=owner, attributes=attributes, transactions=transactions)
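# Hypothetical driver (not in the original source): parses every saved punk
# page in a local `punks/` directory; assumes `get_id`, `tr_to_dict`, and
# `get_owner` are the helpers the function above relies on.
from pathlib import Path

punks = [parse(str(path)) for path in Path("punks").glob("*.html")]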
def parser_selenium(page):
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    try:
        browser = Chrome(options=chrome_options)
    except Exception:
        return 'NAME', 'Chrome components are missing, so the account type cannot be determined.'
    browser.get(page)
    browser.find_element_by_xpath(
        '/html/body/div[2]/div/div/div/div[2]/button[1]').click()
    try:
        tipo = browser.find_element_by_xpath(
            '//*[@id="react-root"]/section/main/div/div/article/div/div/h2')
        tipo = tipo.text
    except Exception:
        tipo = 'This account is public'
    page = browser.page_source
    html_g = Soup(page)
    try:
        nombre = html_g.find('h1', attrs={'class': 'rhpdm'})
        nombre = nombre.text
    except Exception:
        nombre = 'user without a name'
    return nombre, tipo
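# Portability sketch (assumption, not from the original source): the
# find_element_by_* helpers used above were removed in Selenium 4; this is the
# equivalent lookup on current versions.
from selenium.webdriver.common.by import By

def click_accept_button(browser):
    # same XPath as above, expressed with the Selenium 4 API
    browser.find_element(By.XPATH, '/html/body/div[2]/div/div/div/div[2]/button[1]').click()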
def get_article_for_packerswire(url: str) -> Optional[str]:
    try:
        soup = Soup(get(url))
        lines = soup.find('div', {'class': 'articleBody'}).find('p')
        return _arranged(lines)
    except Exception:
        return ''
def get_index_components(query: str = 'https://stooq.pl/q/i/?s=mwig40'):
    '''
    Return the components of the index specified by `query` (for the Polish
    Stock Exchange (GPW) that would be, for example, mWIG40).

    For the time being the default `query` downloads mWIG40 data; pointing it
    at some other index (WIG20, for example) should be straightforward.
    '''
    html_contents = []
    companies = []
    html = get(query)
    soup = Soup(html)
    data_table = soup.find('tbody')
    for i in data_table:
        if i.find('font') is None:
            continue
        html_contents.append(i.find('font'))
    for element in html_contents[0]:
        if element.find('a') is None:
            continue
        companies.append(re.findall(">(.*)</a>", str(element.find('a')))[0])
    return companies
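# Hypothetical usage (not in the original source): prints the mWIG40
# constituents using the default query.
for company in get_index_components():
    print(company)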
def comprobar_estado_sl(html, user, sl):
    html_s = Soup(html)
    a = html_s.find('meta', attrs={'property': 'og:description'})
    dic = a.attrs
    n = dic['content'].replace('-', ',')
    # fix the problem of ',' used as a thousands separator inside numbers
    new_n = n[0]
    for i in range(1, len(n) - 1):
        if n[i - 1].isdigit() and n[i] == ',' and n[i + 1].isdigit():
            new_n += '.'
        else:
            new_n += n[i]
    if len(n) > 1:
        new_n += n[-1]
    n = new_n
    n = n.split(',')
    data = (x.strip() for x in n)
    # e.g. ('431 Followers', '872 Following', '294 Posts', 'See Instagram photos and videos from JP (@juanpedro)')
    seguidores, seguidos, publicaciones, usuario = data
    *_, usuario = usuario.split(' ')  # '@juanpedro'
    if '(' in usuario:
        usuario = usuario[1:-1]
    if sl:
        url_user = url + usuario + '/?hl=es'
        nombre, tipo = parser_selenium(url_user)
    else:
        nombre, tipo = 'name?', 'Selenium is not installed, so the account type cannot be determined...'
    return usuario, nombre, seguidores, seguidos, publicaciones, tipo
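# Assumed module-level constant (not shown in the original snippet): the
# Instagram base URL that `comprobar_estado_sl` prepends to the username.
url = 'https://www.instagram.com/'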
def scrape_position_cbs(position):
    url = f'https://www.cbssports.com/fantasy/hockey/stats/{position}/2019/season/projections/'
    html = get(url)
    soup = Soup(html)
    table = soup.find('table', {'class': 'TableBase-table'})
    pdf = pd.read_html(str(table))[0]
    return pdf
def get_game_stats(url):
    url += "/stats"
    html = get(url)
    soup = Soup(html)
    rows = soup.find("div", {"class": "BoxScore__statLine"})
    data = [parse_stat_row(row) for row in rows]
    return data
def get_article_for_dev_to(url: str) -> Optional[str]:
    try:
        soup = Soup(get(url))
        lines = soup.find('div', {'id': 'article-body'}).find('p')
        return _arranged(lines)
    except Exception:
        return ''
async def explosm(self, ctx):
    async with ctx.channel.typing():
        comic = "https://explosm.net/rcg"
        async with aiohttp.ClientSession() as session:
            async with session.get(comic) as resp:
                html = await resp.text()
        comic_soup = Soup(html)
        images = comic_soup.find("div", {"class": "rcg-panels"})
        urls = [img.attrs["src"] for img in images.find("img")]
        filenames = []
        for url in urls:
            filename = f"{ctx.author.id}-{time.time()}.png"
            filenames.append(filename)
            await self.get_image(url, filename)
        final_filename = f"{ctx.author.id}-{time.time()}.png"
        self.combine_images(*filenames, final_filename)
        for filename in filenames:
            os.remove(filename)
        await ctx.reply(file=discord.File(final_filename))
        os.remove(final_filename)
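# Hypothetical helper (not shown in the original snippet): `get_image` is
# assumed to download one comic panel to disk; a minimal aiohttp sketch:
async def get_image(self, url, filename):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            with open(filename, "wb") as f:
                f.write(await resp.read())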
def get_article(url: str) -> Optional[str]:
    try:
        soup = Soup(get(url))
        lines = soup.find('p')
        return _arranged(lines)
    except Exception:
        return ''
def _get_page_quotes(soup: Soup) -> List[Dict[str, str]]:
    quotes = []
    quote_texts = soup.find("div", {"class": "quoteText"}, mode="all")
    assert isinstance(quote_texts, list)
    for quote_text in quote_texts:
        quote = _parse_quote(quote_text)
        quotes.append(quote)
    return quotes
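# Hypothetical driver (not in the original source): fetches one Goodreads
# quotes page and runs the two helpers above; the URL is a placeholder.
quotes_html = get("https://www.goodreads.com/quotes?page=1")
for quote in _get_page_quotes(Soup(quotes_html)):
    print(quote["author"], "-", quote["quote"][:60])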
def scrape():
    url = "https://www.amazon.ca/Witcher-Nintendo-Switch-Games-Software/dp/B07T4D63YT/"
    browser.get(url)
    html = browser.page_source
    soup = Soup(html)
    price = soup.find("span", {"id": "price"}, partial=True, mode='first').text
    price = float(price.replace("CDN$\xa0", ""))
    return f"The Witcher 3 is ${price} on Amazon.ca right now"
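# Assumed module-level setup (not shown in the original snippet): `browser` is
# a Selenium driver created once at import time, e.g. a headless Chrome.
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")
browser = Chrome(options=options)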
def fetch_sale():
    url = "https://scrape.world/books"
    html = get(url)
    soup = Soup(html)
    # gazpacho matches classes partially by default, so "book-" catches book-1, book-2, ...
    books_raw = soup.find("div", {"class": "book-"})
    books = [parse(book) for book in books_raw]
    on_sale = [name for name, price in books if price == 0.99]
    return "\n".join(on_sale)
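# Hypothetical helper (not shown in the original snippet): `fetch_sale` expects
# `parse` to return a (name, price) tuple for each book div; a minimal sketch:
def parse(book):
    name = book.find("h4").text
    price = float(book.find("p").text[1:].split(" ")[0])  # strip the "$" prefix
    return name, price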
def get_article_for_packers(url: str) -> Optional[str]:
    try:
        soup = Soup(get(url))
        lines = soup.find('div', {'class': 'nfl-c-body-part nfl-c-body-part--text'})
        return _arranged(lines)
    except Exception:
        return ''
def scrape_position_numberfire(position):
    url = f'https://www.numberfire.com/nhl/fantasy/yearly-projections/{position}'
    html = get(url)
    soup = Soup(html)
    tables = soup.find('table', {'class': 'projection-table'})
    names = pd.read_html(str(tables[0]))[0]
    data = pd.read_html(str(tables[1]))[0]
    df = pd.concat([names, data], axis=1)
    return df
def scrape():
    from gazpacho import get, Soup
    url = 'https://front.njpwworld.com/search/latest?page=465'
    soup = Soup(get(url))
    movie_areas = soup.find('div', {'class': 'movieArea'})
    links = [m.find('a') for m in movie_areas]
    for link in links:
        movie_id = link[0].attrs['href'].replace('/p/', '')
        url = f'{endpoint_movie}{movie_id}'
        res = requests.post(url)
def get_boxscore_urls(date):
    if isinstance(date, pd.Timestamp):
        date = date.strftime("%Y-%m-%d")
    url = f"{base}/nba/events/date/{date}"
    html = get(url)
    soup = Soup(html)
    games = soup.find("div", {'class': "Layout__content"}).find('a', mode='all')
    urls = [base + game.attrs['href'] for game in games]
    return urls
def get_game_stats(url):
    url += "/stats"
    html = get(url)
    soup = Soup(html)
    rows = soup.find("div", {"class": "BoxScore__statLine"})
    data = []
    for row in rows:
        try:
            data.append(parse_stat_row(row))
        except Exception:
            pass
    return data
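# Hypothetical pipeline (not in the original source): pulls every box score for
# one date; assumes `base` is the same site root used by `get_boxscore_urls`.
for boxscore_url in get_boxscore_urls("2020-01-15"):
    stats = get_game_stats(boxscore_url)
    print(boxscore_url, len(stats), "stat lines")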
def scrape_daily_faceoff():
    URL = 'https://www.dailyfaceoff.com/fantasy-hockey-projections/'
    html = get(URL)
    soup = Soup(html)
    # DataFrame.append was removed in pandas 2.0; collect frames and concat
    frames = []
    for table_id in ['igsv', 'igsv-1']:
        table = soup.find(
            'table', {'id': f'{table_id}-1N8XNZpOIb8-6WcOPANqSHRyHBXlwZ6X_1vgGyDbETm4'})
        frames.append(pd.read_html(str(table))[0])
    df = pd.concat(frames).reset_index(drop=True)
    return df
def yahoo_draft_rankings():
    URL = 'https://www.fantasypros.com/nhl/adp/overall.php'
    html = get(URL)
    soup = Soup(html)
    df = pd.read_html(str(soup.find('table')))[0]
    df[['first', 'last', 'team']] = df['Player Team'].str.split(' ', n=2, expand=True)
    df['name'] = df['first'] + ' ' + df['last']
    df.columns = [c.lower() for c in df.columns]
    df = df[['name', 'yahoo']]
    return df
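# Hypothetical usage (not in the original source): joins Yahoo ADP onto the
# Daily Faceoff projections by player name; the 'Player' column is an
# assumption about the projections table layout.
projections = scrape_daily_faceoff()
adp = yahoo_draft_rankings()
merged = projections.merge(adp, left_on='Player', right_on='name', how='left')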
async def wikihow(self, ctx):
    async with ctx.channel.typing():
        wikihow_article = "https://wikihow.com/Special:Randomizer"
        async with aiohttp.ClientSession() as session:
            async with session.get(wikihow_article) as resp:
                html = await resp.text()
        wikihow_soup = Soup(html)
        images = wikihow_soup.find("li", {"id": "step-id"})
        await ctx.reply(random.choice(images).find("img")[1].attrs["src"])
def from_file(cls, html_file: str):
    """
    Load the player html from a previously saved file
    """
    with open(html_file, "r") as f:
        # get player id from the canonical link
        soup = Soup(f.read())
        href = soup.find("link", attrs={"rel": "canonical"}).attrs["href"]
        id = int(href.split("/")[6].split(".")[0])
    return cls(id=id, html_file=html_file)
async def google_play_link(event: Union[NewMessage.Event, Message]):
    try:
        html = get(event.pattern_match.group(1))
        soup = Soup(html)
        title = soup.find('div', {'class': 'title fade-out'})
        artist = soup.find('div', {'class': 'album-artist fade-out'})
        await event.respond(
            "Do you want to search for this song? Tap the button below",
            buttons=[[
                Button.switch_inline(translate.SEARCH_TRACK,
                                     query='%s - %s' % (title.text, artist.text),
                                     same_peer=True)
            ]])
        raise events.StopPropagation
    except HTTPError:
        pass
def from_files(cls, html_file: str, json_file: str):
    """
    Create series object from offline files
    """
    with open(html_file, "r") as f:
        # get series_id from the canonical link
        soup = Soup(f.read())
        href = soup.find("link", attrs={"rel": "canonical"}).attrs["href"]
        id = int(href.split("/")[6])
    return cls(id=id, html_file=html_file, json_file=json_file)
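# Hypothetical usage (not in the original source): `Player` and `Series` stand
# in for whatever classes the two classmethods above belong to, and the file
# names are placeholders for previously saved pages.
player = Player.from_file("player_8478402.html")
series = Series.from_files("series_2019030411.html", "series_2019030411.json")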