Example #1
# Assumed imports for this excerpt: lxml's fromstring/tostring aliased as html/tos.
from flask import jsonify
from lxml.html import fromstring as html, tostring as tos
from requests import get
import xmltodict

def apa():
    base_url = 'http://vermont.craigslist.org'
    full_url = base_url + '/search/apa?query=burlington'
    # Serialize each Craigslist result row (<p class="row">) back to markup
    # and convert it to a dict so the whole listing can be JSONified.
    return jsonify(
        base_url=base_url,
        full_url=full_url,
        apts=[xmltodict.parse(tos(el)) for el in html(get(full_url).text).xpath("//p[@class='row']")]
    )
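A minimal, hypothetical Flask wiring for this view; the app object and the route path are assumptions, not part of the original excerpt:

from flask import Flask

app = Flask(__name__)
# Register apa() at a route; jsonify() above needs an active app/request context.
app.add_url_rule('/apa', view_func=apa)

Requesting /apa then returns the scraped apartment listings as JSON.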
Example #2
def scrape_shows(scrape_url: str) -> List[Show]:
    """
    :param scrape_url: URL of the webpage to scrape for Shows
    :return: the Shows that could be parsed from that page
    """
    logging.info("Fetching shows data from %s", scrape_url)
    shows_page = html(http_session.get(scrape_url).content)
    # Every link whose href mentions 'artist' is treated as a Show page.
    hrefs = shows_page.xpath("//a[contains(@href, 'artist')]/@href")
    recurring_shows = []
    for href in hrefs:
        try:
            recurring_shows.append(Show(href))
        except Exception:
            # Skip pages that fail to parse, but keep the traceback in the log.
            logging.error('Could not parse Show from %s', href, exc_info=True)
    return recurring_shows
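Examples #2–#4 read like excerpts from one scraper module. A minimal sketch of the module-level setup they appear to assume; the aliases and the shared session are inferences, not shown in the original:

import logging
from typing import List

import requests
from furl import furl
from lxml.html import fromstring as html

# One shared session so the scrapes reuse HTTP connections.
http_session = requests.Session()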
Example #3
def scrape_podcast_episodes(scrape_url: str) -> List[PodcastEpisode]:
    """
    :param scrape_url: URL of the webpage to scrape for PodcastEpisodes
    :return: the PodcastEpisodes that could be parsed from that page
    """
    logging.info("Fetching podcast data from %s", scrape_url)
    podcasts_page = html(http_session.get(scrape_url).content)
    # TODO: scrape more than the front page
    episodes = []
    for div in podcasts_page.xpath('//div[contains(@class, "podcast-list-item")]'):
        try:
            episodes.append(PodcastEpisode(div))
        except Exception:
            # Log the traceback, not just the exception message.
            logging.error('Could not parse PodcastEpisode', exc_info=True)
    return episodes
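The loop above relies on PodcastEpisode raising when a list item is malformed, so the caller can log and skip it. A hypothetical sketch of that pattern; the field names and XPaths here are illustrative guesses, not from the original:

class PodcastEpisode:
    def __init__(self, div):
        # An IndexError on malformed markup propagates to the caller,
        # which logs it and moves on to the next list item.
        self.title = div.xpath('.//h3//text()')[0].strip()
        self.audio_url = div.xpath('.//a[contains(@href, ".mp3")]/@href')[0]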
Example #4
    # Excerpt: constructor of the Show class used in Example #2.
    def __init__(self, url, slug=None, name=None, description=None):
        # Only scrape the show page when any metadata field is missing.
        if not (slug and name and description):
            logging.info('Scraping Show from <%s>…', url)
            slug = Show.parse_slug(furl(url))
            show_page = html(http_session.get(url).content)
            try:
                base_xpath = '/html/body/div[@id="wrapper"]/div[@id="container"]/div[contains(@class, "rounded")]/div'
                name = show_page.xpath(base_xpath + '/div/h2//text()')[0]
                description = '\n\n'.join(show_page.xpath(base_xpath + '/div[contains(@class, "entry")]/p//text()'))
            except IndexError:
                logging.error("Failed scraping Show name on show page <%s>.", url, exc_info=True)
                raise
            logging.info('Scraped Show from <%s>.', url)

        self.slug = slug
        self.name = name
        self.description = description
        self.web_url = url
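A plausible sketch of the Show.parse_slug helper this constructor calls. The original only shows that it takes a furl URL, so the extraction rule here is an assumption:

    @staticmethod
    def parse_slug(url: furl) -> str:
        # Assume the slug is the last non-empty path segment,
        # e.g. furl('https://example.com/artist/my-show/') -> 'my-show'.
        return [segment for segment in url.path.segments if segment][-1]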
Example #5
# Assumed imports; `html`, `html2string`, `download`, `resize`, and `optimize`
# are helpers defined elsewhere in the project. Ported from Python 2 prints.
import os
from hashlib import sha1

def process(args):
    # Unpack the worker arguments: target image directory, the HTML files
    # to rewrite, and an identifier used only for progress reporting.
    images, filepaths, uid = args
    count = len(filepaths)
    print('offlining start', uid)
    for index, filepath in enumerate(filepaths, 1):
        print('offline %s/%s (%s)' % (index, count, uid))
        try:
            # `html` is assumed to parse a file path into an element tree here.
            body = html(filepath)
        except Exception as exc:  # error during HTML parsing
            print(exc)
        else:
            imgs = body.xpath('//img')
            for img in imgs:
                src = img.attrib['src']
                ext = os.path.splitext(src)[1]
                # Derive a stable local filename from the source URL.
                filename = sha1(src.encode('utf-8')).hexdigest() + ext
                out = os.path.join(images, filename)
                # download the image only if it's not already downloaded
                if not os.path.exists(out):
                    try:
                        download(src, out)
                    except Exception:
                        # leave the original remote URL in place
                        pass
                    else:
                        # update the post's HTML to point at the local copy
                        src = '../static/images/' + filename
                        img.attrib['src'] = src
                        # finalize offlining
                        resize(out)
                        optimize(out)
            # does the post contain images? if so, we surely modified
            # its content, so save it.
            if imgs:
                post = html2string(body)
                with open(filepath, 'w') as f:
                    f.write(post)
    print('offlining finished', uid)
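The packed args tuple suggests process() is meant to run in a worker pool. A minimal, hypothetical driver; the directory layout and the uid-to-filepaths grouping are assumptions:

from multiprocessing import Pool

def offline_all(images_dir, filepaths_by_uid):
    # filepaths_by_uid: mapping of uid -> list of HTML files for that post set
    jobs = [(images_dir, filepaths, uid)
            for uid, filepaths in filepaths_by_uid.items()]
    with Pool() as pool:
        pool.map(process, jobs)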