Example #1
# Assumed imports for this excerpt: lxml's fromstring/tostring aliased as html/tos.
from flask import jsonify
from lxml.html import fromstring as html, tostring as tos
from requests import get
import xmltodict

def apa():
    base_url = 'http://vermont.craigslist.org'
    full_url = base_url + '/search/apa?query=burlington'
    # Serialize each Craigslist result row (<p class="row">) back to markup
    # and convert it to a dict so the whole listing can be JSONified.
    return jsonify(
        base_url=base_url,
        full_url=full_url,
        apts=[xmltodict.parse(tos(el)) for el in html(get(full_url).text).xpath("//p[@class='row']")]
    )
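A minimal, hypothetical Flask wiring for this view; the app object and the route path are assumptions, not part of the original excerpt:

from flask import Flask

app = Flask(__name__)
# Register apa() at a route; jsonify() above needs an active app/request context.
app.add_url_rule('/apa', view_func=apa)

Requesting /apa then returns the scraped apartment listings as JSON.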
Example #2
def scrape_shows(scrape_url: str) -> List[Show]:
    """
    :param scrape_url: URL of the webpage to scrape for Shows
    :return: the Shows that could be parsed from that page
    """
    logging.info("Fetching shows data from %s", scrape_url)
    shows_page = html(http_session.get(scrape_url).content)
    # Every link whose href mentions 'artist' is treated as a Show page.
    hrefs = shows_page.xpath("//a[contains(@href, 'artist')]/@href")
    recurring_shows = []
    for href in hrefs:
        try:
            recurring_shows.append(Show(href))
        except Exception:
            # Skip pages that fail to parse, but keep the traceback in the log.
            logging.error('Could not parse Show from %s', href, exc_info=True)
    return recurring_shows
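Examples #2–#4 read like excerpts from one scraper module. A minimal sketch of the module-level setup they appear to assume; the aliases and the shared session are inferences, not shown in the original:

import logging
from typing import List

import requests
from furl import furl
from lxml.html import fromstring as html

# One shared session so the scrapes reuse HTTP connections.
http_session = requests.Session()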
Example #3
def scrape_podcast_episodes(scrape_url: str) -> List[PodcastEpisode]:
    """
    :param scrape_url: URL of the webpage to scrape for PodcastEpisodes
    :return: the PodcastEpisodes that could be parsed from that page
    """
    logging.info("Fetching podcast data from %s", scrape_url)
    podcasts_page = html(http_session.get(scrape_url).content)
    # TODO: scrape more than the front page
    episodes = []
    for div in podcasts_page.xpath('//div[contains(@class, "podcast-list-item")]'):
        try:
            episodes.append(PodcastEpisode(div))
        except Exception:
            # Log the traceback, not just the exception message.
            logging.error('Could not parse PodcastEpisode', exc_info=True)
    return episodes
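The loop above relies on PodcastEpisode raising when a list item is malformed, so the caller can log and skip it. A hypothetical sketch of that pattern; the field names and XPaths here are illustrative guesses, not from the original:

class PodcastEpisode:
    def __init__(self, div):
        # An IndexError on malformed markup propagates to the caller,
        # which logs it and moves on to the next list item.
        self.title = div.xpath('.//h3//text()')[0].strip()
        self.audio_url = div.xpath('.//a[contains(@href, ".mp3")]/@href')[0]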
Example #4
    # Excerpt: constructor of the Show class used in Example #2.
    def __init__(self, url, slug=None, name=None, description=None):
        # Only scrape the show page when any metadata field is missing.
        if not (slug and name and description):
            logging.info('Scraping Show from <%s>…', url)
            slug = Show.parse_slug(furl(url))
            show_page = html(http_session.get(url).content)
            try:
                base_xpath = '/html/body/div[@id="wrapper"]/div[@id="container"]/div[contains(@class, "rounded")]/div'
                name = show_page.xpath(base_xpath + '/div/h2//text()')[0]
                description = '\n\n'.join(show_page.xpath(base_xpath + '/div[contains(@class, "entry")]/p//text()'))
            except IndexError:
                logging.error("Failed scraping Show name on show page <%s>.", url, exc_info=True)
                raise
            logging.info('Scraped Show from <%s>.', url)

        self.slug = slug
        self.name = name
        self.description = description
        self.web_url = url
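A plausible sketch of the Show.parse_slug helper this constructor calls. The original only shows that it takes a furl URL, so the extraction rule here is an assumption:

    @staticmethod
    def parse_slug(url: furl) -> str:
        # Assume the slug is the last non-empty path segment,
        # e.g. furl('https://example.com/artist/my-show/') -> 'my-show'.
        return [segment for segment in url.path.segments if segment][-1]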
Example #5
# Assumed imports; `html`, `html2string`, `download`, `resize`, and `optimize`
# are helpers defined elsewhere in the project. Ported from Python 2 prints.
import os
from hashlib import sha1

def process(args):
    # Unpack the worker arguments: target image directory, the HTML files
    # to rewrite, and an identifier used only for progress reporting.
    images, filepaths, uid = args
    count = len(filepaths)
    print('offlining start', uid)
    for index, filepath in enumerate(filepaths, 1):
        print('offline %s/%s (%s)' % (index, count, uid))
        try:
            # `html` is assumed to parse a file path into an element tree here.
            body = html(filepath)
        except Exception as exc:  # error during HTML parsing
            print(exc)
        else:
            imgs = body.xpath('//img')
            for img in imgs:
                src = img.attrib['src']
                ext = os.path.splitext(src)[1]
                # Derive a stable local filename from the source URL.
                filename = sha1(src.encode('utf-8')).hexdigest() + ext
                out = os.path.join(images, filename)
                # download the image only if it's not already downloaded
                if not os.path.exists(out):
                    try:
                        download(src, out)
                    except Exception:
                        # leave the original remote URL in place
                        pass
                    else:
                        # update the post's HTML to point at the local copy
                        src = '../static/images/' + filename
                        img.attrib['src'] = src
                        # finalize offlining
                        resize(out)
                        optimize(out)
            # does the post contain images? if so, we surely modified
            # its content, so save it.
            if imgs:
                post = html2string(body)
                with open(filepath, 'w') as f:
                    f.write(post)
    print('offlining finished', uid)
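The packed args tuple suggests process() is meant to run in a worker pool. A minimal, hypothetical driver; the directory layout and the uid-to-filepaths grouping are assumptions:

from multiprocessing import Pool

def offline_all(images_dir, filepaths_by_uid):
    # filepaths_by_uid: mapping of uid -> list of HTML files for that post set
    jobs = [(images_dir, filepaths, uid)
            for uid, filepaths in filepaths_by_uid.items()]
    with Pool() as pool:
        pool.map(process, jobs)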