Example #1
def parse_content(content, encoding=None):
    # parse once with whatever encoding the caller supplied (if any)
    html = parse_soup(content, from_encoding=encoding)
    old_encoding = encoding

    # then check which encoding the parsed document actually declares
    encoding = get_encoding(html)

    # re-parse only if the declared encoding differs from the one used above
    if encoding is not None and encoding != old_encoding:
        html = parse_soup(content, from_encoding=encoding)

    return html
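
The get_encoding helper used above is not shown; as a rough sketch (an assumption, not the original implementation), it could read the encoding the document itself declares in a <meta> tag:

def get_encoding(soup):
    # prefer an explicit <meta charset="..."> declaration
    meta = soup.find("meta", charset=True)
    if meta:
        return meta["charset"]

    # fall back to <meta http-equiv="Content-Type" content="...; charset=...">
    meta = soup.find(
        "meta", attrs={"http-equiv": lambda v: v and v.lower() == "content-type"})
    if meta and "charset=" in meta.get("content", ""):
        return meta["content"].split("charset=")[-1].strip()

    return None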
Example #2
def isup(text):
    """<url> - uses isup.me to check if <url> is online or offline

    :type text: str
    """
    url = text.strip()

    # slightly overcomplicated, esoteric URL parsing
    _, auth, path, _, _ = urllib.parse.urlsplit(url)

    domain = auth or path

    try:
        response = requests.get('http://isup.me/' + domain)
        response.raise_for_status()
    except requests.exceptions.ConnectionError:
        return "Failed to get status."
    if response.status_code != requests.codes.ok:
        return "Failed to get status."

    soup = parse_soup(response.text)

    content = soup.find('div', id="domain-main-content").text.strip()

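    # \x02, \x03<n> and \x0f in the replies below are IRC formatting control
    # codes (bold, colour and reset, respectively)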
    if "not just you" in content:
        return "It's not just you. {} looks \x02\x034down\x02\x0f from here!".format(
            url)

    if "is up" in content:
        return "It's just you. {} is \x02\x033up\x02\x0f.".format(url)

    return "Huh? That doesn't look like a site on the interweb."
Example #3
def parse_page(content):
    """Parse the horoscope page

    >>> parse_page('')
    Traceback (most recent call last):
        [...]
    plugins.horoscope.HoroscopeParseError: Unable to parse horoscope
    >>> parse_page('<div class="main-horoscope"><div>hello world</div></div>')
    Traceback (most recent call last):
        [...]
    plugins.horoscope.HoroscopeParseError: Unable to parse horoscope
    >>> parse_page('<div class="main-horoscope"><p>hello world</p></div>')
    'hello world'

    """
    soup = parse_soup(content)
    container = soup.find("div", class_="main-horoscope")
    if not container:
        raise HoroscopeParseError("Unable to parse horoscope", content)

    para = container.p
    if not para:
        raise HoroscopeParseError("Unable to parse horoscope", content)

    return para.text
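
The examples in the docstring above are doctests; a minimal way to run them, assuming the function lives in a plugins.horoscope module as the expected tracebacks suggest:

import doctest

from plugins import horoscope

doctest.testmod(horoscope, verbose=True)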
Example #4
def query(endpoint, text):
    """Queries the given search endpoint and returns the parsed result page."""
    # collapse runs of whitespace in the query text
    params = {'q': " ".join(text.split())}
    with requests.get(
            search_url + "/" + endpoint, params=params, headers=HEADERS,
            verify=session.verify
    ) as r:
        r.raise_for_status()
        return parse_soup(r.content)
Example #5
def mfp(text, reply):
    """<user> - returns macros from the MyFitnessPal food diary of <user>"""
    request = requests.get(scrape_url.format(text))

    try:
        request.raise_for_status()
    except HTTPError as e:
        reply("Failed to fetch info ({})".format(e.response.status_code))
        raise

    if request.status_code != requests.codes.ok:
        return "Failed to fetch info ({})".format(request.status_code)

    output = "Diary for {}: ".format(text)

    try:
        soup = parse_soup(request.text)

        title = soup.find('h1', {'class': 'main-title'})
        if title:
            if title.text == 'This Food Diary is Private':
                return "{}'s food diary is private.".format(text)
            if title.text == 'This Username is Invalid':
                return "User {} does not exist.".format(text)

        # the table's contents depend on the user's MFP profile configuration
        headers = get_headers(soup)
        totals = get_values(soup, 'total')
        remaining = get_values(soup, 'alt')

        for idx, val in enumerate(headers['captions']):
            kwargs = {
                'caption': val,
                'total': totals[idx],
                'remain': remaining[idx],
                'units': headers['units'][idx],
                'pct': math.floor((totals[idx] / remaining[idx]) * 100)
            }

            output += ("{caption}: {total}/{remain}{units} ({pct}%) ".format(
                **kwargs))

        output += " ({})".format(scrape_url.format(text))

    except Exception:
        reply("Error parsing results.")
        raise

    return output
Example #6
def RUADICK(text, message):
    """<username> - checks ruadick.com to see if you're a dick on reddit"""
    DickCheck = text.strip()
    dickstatus = requests.get(
        'http://www.ruadick.com/user/{}'.format(DickCheck))
    dickstatus.raise_for_status()
    DickSoup = parse_soup(dickstatus.content)
    heading = DickSoup.h2

    # soup.h2 is None when the page has no result for that user; otherwise
    # .text yields the heading without its <h2> tags (str.lstrip/rstrip strip
    # characters, not substrings, so they cannot be used to remove the tags)
    if heading is None:
        message("I can't find that user")
    else:
        message('{} {}'.format(heading.text.strip(), dickstatus.url))
Example #7
def steam(text, reply):
    """<query> - Search for specified game/trailer/DLC"""
    params = {"term": text.strip().lower()}

    try:
        request = requests.get("http://store.steampowered.com/search/",
                               params=params)
        request.raise_for_status()
    except requests.RequestException as e:
        reply("Could not get game info: {}".format(e))
        raise

    soup = parse_soup(request.text, from_encoding="utf-8")
    result = soup.find("a", {"class": "search_result_row"})

    if not result:
        return "No game found."

    app_id = result["data-ds-appid"]
    return format_game(app_id)
Example #8
async def refresh_fml_cache(loop):
    """ gets a page of random FMLs and puts them into a dictionary """
    url = 'http://www.fmylife.com/random/'
    _func = functools.partial(requests.get, url, timeout=6)
    request = await loop.run_in_executor(None, _func)
    soup = parse_soup(request.text)

    for e in soup.find_all('p', {'class': 'block'}):
        # the /today bit is there to exclude fml news etc.
        a = e.find('a', {'href': re.compile('/article/today')})
        if not a:
            continue

        # the .html in the url must be removed before extracting the id
        fml_id = int(a['href'][:-5].split('_')[-1])
        text = a.text.strip()

        # exclude lengthy submissions and FML photos
        if len(text) > 375 or text[-3:].lower() != "fml":
            continue
        fml_cache.append((fml_id, text))
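
For reference, here is what that slicing does with a made-up href of the shape the code expects (illustrative only, not a real article URL):

href = "/article/today/some-story_12345.html"
fml_id = int(href[:-5].split("_")[-1])   # drops ".html", keeps 12345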
Example #9
def steam(text, reply):
    """<query> - Search for specified game/trailer/DLC"""
    params = {'term': text.strip().lower()}

    try:
        request = requests.get("http://store.steampowered.com/search/",
                               params=params)
        request.raise_for_status()
    except (requests.exceptions.HTTPError,
            requests.exceptions.ConnectionError) as e:
        reply("Could not get game info: {}".format(e))
        raise

    soup = parse_soup(request.text, from_encoding="utf-8")
    result = soup.find('a', {'class': 'search_result_row'})

    if not result:
        return "No game found."

    app_id = result['data-ds-appid']
    return format_game(app_id)
Example #10
async def refresh_fml_cache(loop):
    """ gets a page of random FMLs and puts them into a dictionary """
    url = "http://www.fmylife.com/random"
    _func = functools.partial(requests.get, url, timeout=6)
    request = await loop.run_in_executor(None, _func)
    soup = parse_soup(request.text)

    # the /today bit is there to exclude fml news etc.
    articles = soup.find_all("a", {
        "class": "article-link",
        "href": re.compile("/article/today")
    })
    for a in articles:
        # the .html in the url must be removed before extracting the id
        fml_id = int(a["href"][:-5].split("_")[-1])
        text = a.text.strip()

        # exclude lengthy submissions and FML photos
        if len(text) > 375 or text[-3:].lower() != "fml":
            continue
        fml_cache.append((fml_id, text))
Example #11
def xkcd_search(term):
    """Searches for an xkcd comic matching <term> and returns its info."""
    params = {
        's': term,
        'Search': 'Search',
        'comic': 56,
        'e': 0,
        'n': 0,
        'b': 0,
        'm': 0,
        'd': 0,
        't': 0,
    }
    request = requests.get(str(ONR_URL), params=params)
    request.raise_for_status()
    soup = parse_soup(request.text)
    result = soup.find('li')
    if result:
        # the "tinylink" div holds the comic's URL; strip the trailing slash
        # and use the last path segment as the comic id
        url = result.find('div', {'class': 'tinylink'}).text
        xkcd_id = url[:-1].split("/")[-1]
        return xkcd_info(xkcd_id, url=True)

    return "No results found!"
Example #12
def etymology(text, reply):
    """<word> - retrieves the etymology of <word>

    :type text: str
    """

    url = 'http://www.etymonline.com/index.php'

    response = requests.get(url, params={"term": text})

    try:
        response.raise_for_status()
    except HTTPError as e:
        if e.response.status_code == 404:
            return "No etymology found for {} :(".format(text)
        reply("Error reaching etymonline.com: {}".format(
            e.response.status_code))
        raise

    if response.status_code != requests.codes.ok:
        return "Error reaching etymonline.com: {}".format(response.status_code)

    soup = parse_soup(response.text)

    block = soup.find('div', class_=re.compile("word--.+"))

    etym = ' '.join(e.text for e in block.div)

    etym = ' '.join(etym.splitlines())

    etym = ' '.join(etym.split())

    etym = formatting.truncate(etym, 200)

    etym += " Source: " + web.try_shorten(response.url)

    return etym
Example #13
def egg_calculator(text):
    """<time> - parses dragonvalebreedingguide.com for a list of possible dragons based on the incubation time.
    Enter the time as '5 hours, 30 minutes'. For upgraded incubation times, put 'upgrade' at the front of the time string.
    """
    time = ""
    time2 = ""
    if text.lower().startswith("upgrade"):
        timer = text.replace("upgrade", "")
        time2 = time_parse(timer.strip())
        if not time2:
            return "invalid time format"
    else:
        timer = text
        time = time_parse(timer.strip())
        if not time:
            return "invalid time format"
    params = {'time': time, 'time2': time2, 'avail': 1}
    r = requests.get(egg_calc_url, params=params, timeout=5)
    soup = parse_soup(r.text)
    dragons = []
    for line in soup.find_all('td', {'class': 'views-field views-field-title'}):
        dragons.append(line.text.replace("\n", "").strip())

    return ", ".join(dragons)
Example #14
def amazon(text, reply, _parsed=False):
    """<query> - Searches Amazon for query"""
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, '
        'like Gecko) Chrome/41.0.2228.0 Safari/537.36',
        'Referer': 'http://www.amazon.com/'
    }
    params = {'url': 'search-alias', 'field-keywords': text.strip()}
    if _parsed:
        # input is from a link parser, we need a specific URL
        request = requests.get(SEARCH_URL.format(_parsed),
                               params=params,
                               headers=headers)
    else:
        request = requests.get(SEARCH_URL.format(REGION),
                               params=params,
                               headers=headers)

    try:
        request.raise_for_status()
    except HTTPError:
        reply("Amazon API error occurred.")
        raise

    soup = parse_soup(request.text)

    # check if there are any results on the amazon page
    results = soup.find('div', {'id': 'atfResults'})
    if not results:
        if not _parsed:
            return "No results found."

        return None

    # get the first item from the results on the amazon page
    results = results.find('ul', {
        'id': 's-results-list-atf'
    }).find_all('li', {'class': 's-result-item'})
    item = results[0]
    asin = item['data-asin']

    # here we use dirty html scraping to get everything we need
    title = formatting.truncate(
        item.find('h2', {
            'class': 's-access-title'
        }).text, 60)
    tags = []

    # tags!
    if item.find('i', {'class': 'a-icon-prime'}):
        tags.append("$(b)Prime$(b)")

    if item.find('i', {'class': 'sx-bestseller-badge-primary'}):
        tags.append("$(b)Bestseller$(b)")

    # we use regex because we need to recognise text for this part
    # the other parts detect based on html tags, not text
    if re.search(
            r"(Kostenlose Lieferung|Livraison gratuite|FREE Shipping|Envío GRATIS"
            r"|Spedizione gratuita)", item.text, re.I):
        tags.append("$(b)Free Shipping$(b)")

    try:
        price = item.find('span', {'class': ['s-price', 'a-color-price']}).text
    except AttributeError:
        for i in item.find_all('sup', class_='sx-price-fractional'):
            i.string.replace_with('.' + i.string)
        price = item.find('span', {'class': 'sx-price'}).text

    # use a whole lot of BS4 and regex to get the ratings
    try:
        # get the rating
        rating = item.find('i', {
            'class': 'a-icon-star'
        }).find('span', {
            'class': 'a-icon-alt'
        }).text
        rating = re.search(r"([0-9]+(?:[.,][0-9])?).*5",
                           rating).group(1).replace(",", ".")
        # get the rating count
        pattern = re.compile(r'(product-reviews|#customerReviews)')
        num_ratings = item.find('a', {'href': pattern}).text.replace(".", ",")
        # format the rating and count into a nice string
        rating_str = "{}/5 stars ({} ratings)".format(rating, num_ratings)
    except AttributeError:
        rating_str = "No Ratings"

    # generate a short url
    if AFFILIATE_TAG:
        url = "http://www.amazon.com/dp/" + asin + "/?tag=" + AFFILIATE_TAG
    else:
        url = "http://www.amazon.com/dp/" + asin + "/"
    url = web.try_shorten(url)

    # join all the tags into a string
    tag_str = " - " + ", ".join(tags) if tags else ""

    # finally, assemble everything into the final string, and return it!
    if not _parsed:
        return colors.parse("".join("$(b){}$(b) ({}) - {}{} - {}".format(
            title, price, rating_str, tag_str, url).splitlines()))

    return colors.parse("".join("$(b){}$(b) ({}) - {}{}".format(
        title, price, rating_str, tag_str).splitlines()))
Example #15
def get_data(user, currency="us"):
    """
    Takes a steam user ID and returns a dict containing info about the games the user owns
    :type user: str
    :type currency: str
    :return: dict
    """
    data = {}

    # form the request
    params = {'player': user, 'currency': currency}

    # get the page
    try:
        if cfscrape:
            scraper = cfscrape.create_scraper()
            request = scraper.get(CALC_URL, params=params)
        else:
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, '
                'like Gecko) Chrome/41.0.2228.0 Safari/537.36',
                'Referer':
                'https://steamdb.info/'
            }
            request = requests.get(CALC_URL, params=params, headers=headers)

        request.raise_for_status()
    except (requests.exceptions.HTTPError,
            requests.exceptions.ConnectionError) as e:
        if cfscrape:
            raise SteamError("Could not get user info: {}".format(e))
        else:
            raise SteamError(
                "Could not get user info: {} (You may have been blocked by CloudFlare, try installing the "
                "cfscrape module)".format(e))

    # parse that page!
    soup = parse_soup(request.content)

    # get all the data we need
    try:
        data["name"] = soup.find("h1", {
            "class": "header-title"
        }).find("a").text
        data["url"] = request.url

        data["status"] = soup.find('td', text='Status').find_next('td').text

        data["value"] = soup.find("h1", {"class": "calculator-price"}).text
        data["value_sales"] = soup.find("h1", {
            "class": "calculator-price-lowest"
        }).text

        data["count"] = int(
            soup.find("div", {
                "class": "pull-right price-container"
            }).find("p").find("span", {
                "class": "number"
            }).text.replace(',', ''))

        played = soup.find('td', text='Games not played').find_next('td').text
        played = PLAYED_RE.search(played).groups()

        data["count_unplayed"] = int(played[0])
        data["count_played"] = data["count"] - data["count_unplayed"]

        data["percent_unplayed"] = round(
            percentage(data["count_unplayed"], data["count"]), 1)
        data["percent_played"] = round(
            percentage(data["count_played"], data["count"]), 1)

    except AttributeError:
        raise SteamError("Could not read info, does this user exist?")

    return data