def apa():
    """Return a JSON listing of Burlington apartment ads from Craigslist.

    Fetches the Craigslist "apa" search results page and parses every
    result row (<p class="row">) into a dict via xmltodict.
    """
    base_url = 'http://vermont.craigslist.org'
    full_url = 'http://vermont.craigslist.org/search/apa?query=burlington'
    results_page = html(get(full_url).text)
    rows = results_page.xpath("//p[@class='row']")
    listings = [xmltodict.parse(tos(row)) for row in rows]
    return jsonify(base_url=base_url, full_url=full_url, apts=listings)
def scrape_shows(scrape_url: str) -> AbstractSet[Show]:
    """Scrape *scrape_url* for recurring shows.

    :param scrape_url: URL of webpage to scrape for RecurringShows
    :return: a collection of RecurringShows
    """
    logging.info("Fetching shows data from {}".format(scrape_url))
    page = html(http_session.get(scrape_url).content)
    artist_links = page.xpath("//a[contains(@href, 'artist')]/@href")
    shows = []
    for link in artist_links:
        try:
            show = Show(link)
        except Exception:
            # A single bad link should not abort the whole scrape.
            logging.error('Could not parse Show from %s', link, exc_info=True)
        else:
            shows.append(show)
    return shows
def scrape_podcast_episodes(scrape_url: str) -> AbstractSet[PodcastEpisode]:
    """Scrape *scrape_url* for podcast episodes.

    :param scrape_url: URL of webpage to scrape for IndividualPodcasts
    :return: a collection of scraped Podcasts
    """
    logging.info("Fetching podcast data from {0}".format(scrape_url))
    podcasts_page = html(http_session.get(scrape_url).content)
    # TODO: scrape more than the front page
    episodes = []
    for div in podcasts_page.xpath('//div[contains(@class, "podcast-list-item")]'):
        try:
            episodes.append(PodcastEpisode(div))
        except Exception:
            # Fix: logging.error(e) discarded the traceback; log with
            # exc_info=True for consistency with scrape_shows.
            logging.error('Could not parse PodcastEpisode', exc_info=True)
    return episodes
def __init__(self, url, slug=None, name=None, description=None):
    """Create a Show, scraping any missing metadata from its web page.

    :param url: web URL of the show's page
    :param slug: URL slug identifying the show; derived from *url* when omitted
    :param name: display name; scraped from the page when omitted
    :param description: long description; scraped from the page when omitted
    :raises IndexError: when the show name cannot be found on the page

    NOTE(review): if *any* of slug/name/description is falsy, all three are
    (re)derived from the page, overwriting values that were passed in —
    confirm this is intended.
    """
    if not (slug and name and description):
        logging.info('Scraping Show from <%s>…', url)
        slug = Show.parse_slug(furl(url))
        show_page = html(http_session.get(url).content)
        try:
            base_xpath = '/html/body/div[@id="wrapper"]/div[@id="container"]/div[contains(@class, "rounded")]/div'
            name = show_page.xpath(base_xpath + '/div/h2//text()')[0]
            description = '\n\n'.join(show_page.xpath(base_xpath + '/div[contains(@class, "entry")]/p//text()'))
        except IndexError:
            logging.error("Failed scraping Show name on show page <%s>.", url, exc_info=True)
            raise
        # Fix: was logging.info('…' % url) — eager %-formatting; use lazy
        # %-args like every other logging call in this method.
        logging.info('Scrape Show from <%s>.', url)
    self.slug = slug
    self.name = name
    self.description = description
    self.web_url = url
def process(args):
    """Offline (localize) all images referenced by a batch of post HTML files.

    Downloads every <img src> to the *images* directory (named by the SHA-1
    of the original URL), rewrites the src to the local path, and saves the
    modified HTML back to disk.

    :param args: 3-tuple (images, filepaths, uid) — *images* is the directory
        to store downloaded images, *filepaths* the HTML files to rewrite,
        and *uid* an identifier used only for progress output.
    """
    images, filepaths, uid = args
    count = len(filepaths)
    # Fix: converted Python 2 `print` statements to Python 3 calls — the
    # rest of this file uses Python 3 syntax (type hints).
    print('offlining start', uid)
    for index, filepath in enumerate(filepaths):
        print('offline %s/%s (%s)' % (index, count, uid))
        try:
            body = html(filepath)
        except Exception as exc:
            # error during xml parsing — skip this post
            print(exc)
            continue
        imgs = body.xpath('//img')
        for img in imgs:
            src = img.attrib['src']
            ext = os.path.splitext(src)[1]
            # Fix: sha1 requires bytes on Python 3; encode the URL first.
            filename = sha1(src.encode('utf-8')).hexdigest() + ext
            out = os.path.join(images, filename)
            # download the image only if it's not already downloaded
            if not os.path.exists(out):
                try:
                    download(src, out)
                except Exception:
                    # Fix: narrowed the bare `except:` (which also swallowed
                    # KeyboardInterrupt/SystemExit). Best-effort: a failed
                    # download leaves this img untouched.
                    continue
                # update post's html only once the download succeeded
                img.attrib['src'] = '../static/images/' + filename
                # finalize offlining
                resize(out)
                optimize(out)
        # does the post contain images? if so, we surely modified
        # its content so save it.
        if imgs:
            post = html2string(body)
            with open(filepath, 'w') as f:
                f.write(post)
    print('offlining finished', uid)