def find_podcasts(url, verbose=False, depth=0):
    urls = []
    hash_ = hashlib.md5(get_base_url(url)).hexdigest()
    print(url, hash_, depth)
    if hash_ == '73eb773086aa7f75654f4a2d25ca315b':
        if not depth:
            url = url + '/feeds'
        html = download(url)
        doc = pyquery.PyQuery(html)
        doc.make_links_absolute(base_url=get_base_url(url))
        for a in doc('h3 a').items():
            if a.text() == 'Join Now to Follow':
                continue
            # print (a.attr('href'), a.text())
            urls.append(a.attr('href'))
        max_ = 10
        random.shuffle(urls)
        for url in urls[:max_]:
            try:
                _scrape_feed(
                    url,
                    verbose=verbose,
                )
            except NotFound:
                print("WARNING Can't find {}".format(url))

        # Now find the next pages
        if depth < 5:
            next_urls = []
            for a in doc('.pagination a').items():
                if '?page=' in a.attr('href'):
                    next_urls.append(a.attr('href'))

            random.shuffle(urls)
            for next_url in next_urls[:max_]:
                for podcast in find_podcasts(
                    next_url,
                    verbose=verbose,
                    depth=depth + 1
                ):
                    yield podcast
    else:
        html = download(url)
        doc = pyquery.PyQuery(html)
        doc.make_links_absolute(base_url=get_base_url(url))
        for a in doc('ul.nav ul.dropdown-menu li a'):
            href = a.attrib['href']
            if '/browse/' in href:
                urls.append(url)
        max_ = 10
        random.shuffle(urls)
        for url in urls[:max_]:
            yield _scrape_index(
                url,
                verbose=verbose,
                max_=max_,
            )
def find_podcasts(url, verbose=False, depth=0, tested_urls=None):
    urls = []
    hash_ = hashlib.md5(get_base_url(url).encode("utf-8")).hexdigest()
    print((url, hash_, depth))
    if tested_urls is None:
        tested_urls = []  # a mutable
    if hash_ == "73eb773086aa7f75654f4a2d25ca315b":
        if not depth:
            url = url + "/feeds"
        html = download(url)
        doc = pyquery.PyQuery(html)
        doc.make_links_absolute(base_url=get_base_url(url))
        for a in doc("h3 a").items():
            if a.text() == "Join Now to Follow":
                continue
            urls.append(a.attr("href"))
        max_ = 10
        random.shuffle(urls)
        for url in urls[:max_]:
            try:
                _scrape_feed(url, tested_urls, verbose=verbose)
            except NotFound:
                print("WARNING Can't find {}".format(url))

        # Now find the next pages
        if depth < 5:
            next_urls = []
            for a in doc(".pagination a").items():
                if "?page=" in a.attr("href"):
                    next_urls.append(a.attr("href"))

            random.shuffle(urls)
            for next_url in next_urls[:max_]:
                for podcast in find_podcasts(next_url, verbose=verbose,
                                             depth=depth + 1,
                                             tested_urls=tested_urls):
                    yield podcast
    else:
        try:
            html = download(url)
        except ConnectionError:
            return
        doc = pyquery.PyQuery(html)
        doc.make_links_absolute(base_url=get_base_url(url))
        for a in doc("ul.nav ul.dropdown-menu li a"):
            href = a.attrib["href"]
            if "/browse/" in href:
                urls.append(url)
        max_ = 10
        random.shuffle(urls)
        for url in urls[:max_]:
            yield _scrape_index(url, verbose=verbose, max_=max_)
def find_podcasts(url, verbose=False, depth=0, tested_urls=None):
    urls = []
    hash_ = hashlib.md5(get_base_url(url).encode("utf-8")).hexdigest()
    print((url, hash_, depth))
    if tested_urls is None:
        tested_urls = []  # a mutable
    if hash_ == "73eb773086aa7f75654f4a2d25ca315b":
        if not depth:
            url = url + "/feeds"
        html = download(url)
        doc = pyquery.PyQuery(html)
        doc.make_links_absolute(base_url=get_base_url(url))
        for a in doc("h3 a").items():
            if a.text() == "Join Now to Follow":
                continue
            urls.append(a.attr("href"))
        max_ = 10
        random.shuffle(urls)
        for url in urls[:max_]:
            try:
                _scrape_feed(url, tested_urls, verbose=verbose)
            except NotFound:
                print("WARNING Can't find {}".format(url))

        # Now find the next pages
        if depth < 5:
            next_urls = []
            for a in doc(".pagination a").items():
                if "?page=" in a.attr("href"):
                    next_urls.append(a.attr("href"))

            random.shuffle(urls)
            for next_url in next_urls[:max_]:
                for podcast in find_podcasts(
                    next_url, verbose=verbose, depth=depth + 1, tested_urls=tested_urls
                ):
                    yield podcast
    else:
        try:
            html = download(url)
        except ConnectionError:
            return
        doc = pyquery.PyQuery(html)
        doc.make_links_absolute(base_url=get_base_url(url))
        for a in doc("ul.nav ul.dropdown-menu li a"):
            href = a.attrib["href"]
            if "/browse/" in href:
                urls.append(url)
        max_ = 10
        random.shuffle(urls)
        for url in urls[:max_]:
            yield _scrape_index(url, verbose=verbose, max_=max_)
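The helper get_base_url is not shown in these snippets. Judging from how it is used (hashed to recognize which site is being crawled, and passed to make_links_absolute), it presumably reduces a URL to its scheme and host. A minimal sketch under exactly that assumption, nothing more:

from urllib.parse import urlparse


def get_base_url(url):
    # Assumed behavior only (this helper is not part of the code above):
    # strip the path and query so that hashing the result identifies
    # the site being crawled.
    parsed = urlparse(url)
    return "{}://{}".format(parsed.scheme, parsed.netloc)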
def _scrape_feed(url, verbose=False):
    html = download(url, gently=True)
    doc = pyquery.PyQuery(html)
    doc.make_links_absolute(get_base_url(url))
    print "URL:", url
    for a in doc('.span3 li a').items():
        if a.text() == 'RSS':
            feed_url = a.attr('href')
            response = requests.head(feed_url)
            if response.status_code in (301, 302):
                feed_url = response.headers['Location']
            if Podcast.objects.filter(url=feed_url).exists():
                # print "ALREADY HAD", feed_url
                continue
            try:
                image_url = get_image_url(feed_url)
            except ConnectionError:
                print('Unable to download image for {}'.format(feed_url))
                continue
            except ExpatError:
                print('ExpatError when getting image on {}'.format(feed_url))
                continue
            except NotXMLResponse:
                print(
                    'NotXMLResponse when getting image on {}'.format(feed_url)
                )
                continue
            if not image_url:
                print "Skipping (no image)", feed_url
                continue
            assert '://' in image_url, image_url
            podcast = Podcast.objects.create(
                url=feed_url,
                image_url=image_url,
            )
            return podcast
            # print repr(podcast)
            podcast.download_image()
            podcast.download_episodes()
def _scrape_feed(url, tested_urls, verbose=False):
    html = download(url, gently=True)
    doc = pyquery.PyQuery(html)
    doc.make_links_absolute(get_base_url(url))
    print("URL:", url)
    for a in doc(".span3 li a").items():
        if a.text() == "RSS":
            feed_url = a.attr("href")
            response = requests.head(feed_url)
            if response.status_code in (301, 302):
                feed_url = response.headers["Location"]
            if "://" not in feed_url:
                feed_url = "http://" + feed_url
            if feed_url in tested_urls:
                # We've scraped this one before
                continue
            tested_urls.append(feed_url)
            try:
                podcast = Podcast.objects.get(url=feed_url)
                if podcast.name:
                    continue
            except Podcast.DoesNotExist:
                pass
            try:
                image_url = get_image_url(feed_url)
            except ConnectionError:
                print("Unable to download image for {}".format(feed_url))
                continue
            except ExpatError:
                print("ExpatError when getting image on {}".format(feed_url))
                continue
            except NotXMLResponse:
                print(
                    "NotXMLResponse when getting image on {}".format(feed_url))
                continue
            if not image_url:
                print("Skipping (no image)", feed_url)
                continue
            if image_url.startswith("//"):
                if urlparse(feed_url).scheme == "https":
                    image_url = "https:" + image_url
                else:
                    image_url = "http:" + image_url
            assert "://" in image_url, image_url
            podcast, created = Podcast.objects.get_or_create(
                url=feed_url, image_url=image_url)
            if not podcast.name:
                d = feedparser.parse(feed_url)
                print("STATUS?", d.get("status"), feed_url)
                if d.get("status") == 404:
                    print("DELETE {} because of 404 status".format(feed_url))
                    podcast.delete()
                    continue
                if "title" not in d["feed"]:
                    if not d["feed"] and not d["entries"]:
                        print("DELETE {} because no title, feed or "
                              "entries".format(feed_url))
                        podcast.delete()
                        continue
                assert d["feed"]["title"], feed_url
                podcast.name = d["feed"]["title"]
                podcast.save()
def _scrape_feed(url, tested_urls, verbose=False):
    html = download(url, gently=True)
    doc = pyquery.PyQuery(html)
    doc.make_links_absolute(get_base_url(url))
    print("URL:", url)
    for a in doc(".span3 li a").items():
        if a.text() == "RSS":
            feed_url = a.attr("href")
            response = requests.head(feed_url)
            if response.status_code in (301, 302):
                feed_url = response.headers["Location"]
            if "://" not in feed_url:
                feed_url = "http://" + feed_url
            if feed_url in tested_urls:
                # We've scraped this one before
                continue
            tested_urls.append(feed_url)
            try:
                podcast = Podcast.objects.get(url=feed_url)
                if podcast.name:
                    continue
            except Podcast.DoesNotExist:
                pass
            try:
                image_url = get_image_url(feed_url)
            except ConnectionError:
                print("Unable to download image for {}".format(feed_url))
                continue
            except ExpatError:
                print("ExpatError when getting image on {}".format(feed_url))
                continue
            except NotXMLResponse:
                print("NotXMLResponse when getting image on {}".format(feed_url))
                continue
            if not image_url:
                print("Skipping (no image)", feed_url)
                continue
            if image_url.startswith("//"):
                if urlparse(feed_url).scheme == "https":
                    image_url = "https:" + image_url
                else:
                    image_url = "http:" + image_url
            assert "://" in image_url, image_url
            podcast, created = Podcast.objects.get_or_create(
                url=feed_url, image_url=image_url
            )
            if not podcast.name:
                d = feedparser.parse(feed_url)
                print("STATUS?", d.get("status"), feed_url)
                if d.get("status") == 404:
                    print("DELETE {} because of 404 status".format(feed_url))
                    podcast.delete()
                    continue
                if "title" not in d["feed"]:
                    if not d["feed"] and not d["entries"]:
                        print(
                            "DELETE {} because no title, feed or "
                            "entries".format(feed_url)
                        )
                        podcast.delete()
                        continue
                assert d["feed"]["title"], feed_url
                podcast.name = d["feed"]["title"]
                podcast.save()
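Every version of find_podcasts is a generator, so nothing is downloaded or scraped until a caller iterates it. A hypothetical driver loop, purely for illustration (scrape_all and the counting are not part of the code above):

def scrape_all(start_url, verbose=False):
    # Illustration only: exhaust the generator so the recursive
    # pagination and feed scraping actually run.
    found = 0
    for podcast in find_podcasts(start_url, verbose=verbose):
        found += 1
        print("Found podcast #{}: {!r}".format(found, podcast))
    return found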