Code Example #1
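Note: the examples below are excerpted functions, so none of them show their imports. A minimal preamble that would make them loadable might look like the following sketch. The standard-library and third-party names are real; the last two import lines are assumptions about project-local helpers (download, get_base_url, get_image_url, NotXMLResponse, NotAnImageError, Podcast, PodcastError), with made-up module paths, inferred from how the snippets use them.

# Sketch of the imports the snippets below rely on.
import random
from urllib.parse import urljoin, urlparse
from xml.parsers.expat import ExpatError

import feedparser
import pyquery
import requests
from requests.exceptions import ConnectionError  # the snippets catch requests' ConnectionError

# Assumed project-local names (module paths are guesses, not from the project):
from podcasttime.models import Podcast, PodcastError
from podcasttime.scraper_utils import (
    download, get_base_url, get_image_url, NotXMLResponse, NotAnImageError,
)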
def _scrape_feed(url, verbose=False):
    html = download(url, gently=True)
    doc = pyquery.PyQuery(html)
    doc.make_links_absolute(get_base_url(url))
    print("URL:", url)
    for a in doc('.span3 li a').items():
        if a.text() == 'RSS':
            feed_url = a.attr('href')
            response = requests.head(feed_url)
            if response.status_code in (301, 302):
                feed_url = response.headers['Location']
            if Podcast.objects.filter(url=feed_url).exists():
                # Already have this feed; skip it.
                continue
            try:
                image_url = get_image_url(feed_url)
            except ConnectionError:
                print('Unable to download image for {}'.format(feed_url))
                continue
            except ExpatError:
                print('ExpatError when getting image on {}'.format(feed_url))
                continue
            except NotXMLResponse:
                print(
                    'NotXMLResponse when getting image on {}'.format(feed_url)
                )
                continue
            if not image_url:
                print("Skipping (no image)", feed_url)
                continue
            assert '://' in image_url, image_url
            podcast = Podcast.objects.create(
                url=feed_url,
                image_url=image_url,
            )
            podcast.download_image()
            podcast.download_episodes()
            return podcast
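This early version of _scrape_feed returns as soon as it has created (and populated) one new podcast. A hypothetical driver, with a made-up list of index pages, could look like:

# Usage sketch (the index page URLs are illustrative, not from the project).
for page_url in ["https://example.com/podcasts?page=1",
                 "https://example.com/podcasts?page=2"]:
    podcast = _scrape_feed(page_url, verbose=True)
    if podcast is not None:
        print("Created", repr(podcast))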
Code Example #2
File: scraper.py Project: peterbe/django-peterbecom
def _scrape_index(url, verbose=False, max_=1000):
    try:
        html = download(url, gently=True)
    except requests_operational_errors:
        return
    doc = pyquery.PyQuery(html)
    links = doc(".thumbnails a")
    shows = []
    for link in links:
        show_url = link.attrib["href"]
        show_url = urljoin(url, show_url)
        link = pyquery.PyQuery(link)
        for h4 in link.find("h4"):
            name = h4.text_content()  # last <h4> wins if several match
        shows.append((name, show_url))

    existing_names = Podcast.objects.all().values_list("name", flat=True)

    # XXX might not keep this
    shows = [(n, u) for (n, u) in shows if n not in existing_names]
    random.shuffle(shows)
    for name, show_url in shows[:max_]:
        rss_url = _scrape_show(show_url)
        if not rss_url:
            print("Skipping", name, show_url)
            continue

        image_url = get_image_url(rss_url)
        if not image_url:
            print("Skipping (no image)", name, rss_url)
            continue
        assert "://" in image_url, image_url

        podcast, created = Podcast.objects.get_or_create(name=name, url=rss_url)
        podcast.image_url = image_url
        podcast.save()
        try:
            podcast.download_image()
        except (AssertionError, NotAnImageError):
            if verbose:
                print("Got an error trying to download the image :(")
                print("IGNORING AND MOVING ON")
            PodcastError.create(podcast)

        if verbose:
            if created:
                print("CREATED")
            else:
                print("NOT NEW")
            print(repr(name))
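Example #2 catches requests_operational_errors, which is defined elsewhere in the module. A plausible definition (an assumption, not the project's actual tuple) is a group of requests exceptions that signal transient network trouble:

# Assumed definition of the name used in the except clause above.
import requests.exceptions

requests_operational_errors = (
    requests.exceptions.ConnectionError,
    requests.exceptions.Timeout,
    requests.exceptions.TooManyRedirects,
)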
Code Example #3
File: scraper.py Project: peterbe/django-peterbecom
def _scrape_feed(url, tested_urls, verbose=False):
    html = download(url, gently=True)
    doc = pyquery.PyQuery(html)
    doc.make_links_absolute(get_base_url(url))
    print("URL:", url)
    for a in doc(".span3 li a").items():
        if a.text() == "RSS":
            feed_url = a.attr("href")
            response = requests.head(feed_url)
            if response.status_code in (301, 302):
                feed_url = response.headers["Location"]
            if "://" not in feed_url:
                feed_url = "http://" + feed_url
            if feed_url in tested_urls:
                # We've scraped this one before
                continue
            tested_urls.append(feed_url)
            try:
                podcast = Podcast.objects.get(url=feed_url)
                if podcast.name:
                    continue
            except Podcast.DoesNotExist:
                pass
            try:
                image_url = get_image_url(feed_url)
            except ConnectionError:
                print("Unable to download image for {}".format(feed_url))
                continue
            except ExpatError:
                print("ExpatError when getting image on {}".format(feed_url))
                continue
            except NotXMLResponse:
                print(
                    "NotXMLResponse when getting image on {}".format(feed_url))
                continue
            if not image_url:
                print("Skipping (no image)", feed_url)
                continue
            if image_url.startswith("//"):
                if urlparse(feed_url).scheme == "https":
                    image_url = "https:" + image_url
                else:
                    image_url = "http:" + image_url
            assert "://" in image_url, image_url
            podcast, created = Podcast.objects.get_or_create(
                url=feed_url, image_url=image_url)
            if not podcast.name:
                d = feedparser.parse(feed_url)
                print("STATUS?", d.get("status"), feed_url)
                if d.get("status") == 404:
                    print("DELETE {} because of 404 status".format(feed_url))
                    podcast.delete()
                    continue
                if "title" not in d["feed"]:
                    if not d["feed"] and not d["entries"]:
                        print("DELETE {} because no title, feed, or "
                              "entries".format(feed_url))
                        podcast.delete()
                        continue
                assert d["feed"]["title"], feed_url
                podcast.name = d["feed"]["title"]
                podcast.save()
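The tested_urls argument is a list that _scrape_feed mutates in place, so a caller can share one list across many pages and avoid re-processing a feed URL that appears on several of them. A usage sketch (the page URLs are made up):

# One shared list means each feed URL is handled at most once per run.
tested_urls = []
for page in ["https://example.com/podcasts?page=1",
             "https://example.com/podcasts?page=2"]:
    _scrape_feed(page, tested_urls, verbose=True)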
Code Example #4
def _scrape_index(url, verbose=False, max_=1000):
    html = download(url, gently=True)
    doc = pyquery.PyQuery(html)
    links = doc('.thumbnails a')
    shows = []
    for link in links:
        show_url = link.attrib['href']
        show_url = urljoin(url, show_url)
        link = pyquery.PyQuery(link)
        for h4 in link.find('h4'):
            name = h4.text_content()
        shows.append((name, show_url))

    existing_names = Podcast.objects.all().values_list('name', flat=True)

    # XXX might not keep this
    shows = [
        (n, u) for (n, u) in shows
        if n not in existing_names
    ]
    random.shuffle(shows)
    for name, show_url in shows[:max_]:
        rss_url = _scrape_show(show_url)
        if not rss_url:
            print("Skipping", name, show_url)
            continue

        image_url = get_image_url(rss_url)
        if not image_url:
            print("Skipping (no image)", name, rss_url)
            continue
        assert '://' in image_url, image_url

        try:
            podcast = Podcast.objects.get(name=name)
            podcast.url = rss_url
            podcast.image_url = image_url
            podcast.save()
            created = False
        except Podcast.DoesNotExist:
            podcast = Podcast.objects.create(
                name=name,
                url=rss_url,
                image_url=image_url,
            )
            created = True
        try:
            podcast.download_image()
        except (AssertionError, NotAnImageError):
            if verbose:
                print("Got an error trying to download the image :(")
                print("IGNORING AND MOVING ON")
            PodcastError.create(podcast)

        if verbose:
            if created:
                print("CREATED", end=" ")
            else:
                print("NOT NEW", end=" ")
            print(repr(name))
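Both index scrapers absolutize the scraped hrefs with urljoin, which resolves a relative URL against the page it was found on. For reference:

# urljoin keeps the base path for relative hrefs and resets it for
# root-relative ones.
from urllib.parse import urljoin

base = "https://example.com/podcasts/"
assert urljoin(base, "show/42") == "https://example.com/podcasts/show/42"
assert urljoin(base, "/show/42") == "https://example.com/show/42"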