Beispiel #1
0
def download_episodes(podcast, verbose=True):
    """Fetch the episodes of ``podcast`` and keep its ``error`` column in sync.

    The actual work is delegated to ``_download_episodes``. When that
    succeeds and the podcast carried an error, the stored error is cleared.
    ``BadPodcastEntry`` and ``NotFound`` are swallowed and their text is
    stored on the podcast row instead. Any other exception is recorded via
    ``PodcastError.create`` and re-raised.
    """

    def store_error(value):
        # Write straight to the row with a queryset update.
        Podcast.objects.filter(id=podcast.id).update(error=value)

    try:
        _download_episodes(podcast, verbose=verbose)
        if podcast.error:
            store_error(None)
    except (BadPodcastEntry, NotFound) as exception:
        store_error(unicode(exception))
    except Exception:
        PodcastError.create(podcast)
        raise
Beispiel #2
0
def download_episodes_task(podcast_id, verbose=True):
    """Download episodes for the podcast whose primary key is ``podcast_id``.

    Prints a warning and returns when no such podcast exists. If
    ``download_episodes`` raises ``NotFound``, a ``PodcastError`` is created
    and the exception text is saved on ``podcast.error``. Other exceptions
    propagate to the caller.
    """
    try:
        podcast = Podcast.objects.get(id=podcast_id)
    except Podcast.DoesNotExist:
        print("Warning! Podcast with id {} does not exist".format(podcast_id))
        return
    try:
        download_episodes(podcast, verbose=verbose)
    except NotFound as exception:
        PodcastError.create(podcast)
        # A caught exception instance is never ``bytes``, so the former
        # ``isinstance(exception, bytes)`` / ``.decode()`` branch could not
        # run; ``str()`` is the only path that ever executed.
        podcast.error = str(exception)
        podcast.save()
Beispiel #3
0
def download_episodes_task(podcast_id, verbose=True):
    """Download episodes for the podcast whose primary key is ``podcast_id``.

    Prints a warning and returns when no such podcast exists. If
    ``download_episodes`` raises ``NotFound``, a ``PodcastError`` is created
    and the exception text is saved on ``podcast.error``. Other exceptions
    propagate to the caller.
    """
    try:
        podcast = Podcast.objects.get(id=podcast_id)
    except Podcast.DoesNotExist:
        print("Warning! Podcast with id {} does not exist".format(podcast_id))
        return
    try:
        download_episodes(podcast, verbose=verbose)
    except NotFound as exception:
        PodcastError.create(podcast)
        # A caught exception instance is never ``bytes``, so the former
        # ``isinstance(exception, bytes)`` / ``.decode()`` branch could not
        # run; ``str()`` is the only path that ever executed.
        podcast.error = str(exception)
        podcast.save()
Beispiel #4
0
def redownload_podcast_image(podcast_id):
    """Re-download the image of the podcast with id ``podcast_id``.

    After the download the image is verified by generating a 300x300
    thumbnail. If that raises ``IOError`` the image is cleared and the
    podcast saved. Any other failure is recorded via ``PodcastError.create``
    and re-raised.
    """
    podcast = Podcast.objects.get(id=podcast_id)

    def check_thumbnailable():
        # A download can have the right content-type and a non-zero size
        # and still be unusable: turning it into a thumbnail makes PIL
        # throw IOErrors. So a successful thumbnail is the validity check.
        try:
            thumbnail(podcast.image, "300x300")
            print("Worked!")
        except IOError:
            print("Not a valid image if thumbnails can't be made")
            podcast.image = None
            podcast.save()

    try:
        podcast.download_image()
        # If the download worked, it should be possible to make a
        # thumbnail out of it.
        assert podcast.image
        check_thumbnailable()
    except Exception:
        print("Failed!")
        PodcastError.create(podcast)
        raise
Beispiel #5
0
def redownload_podcast_image(podcast_id):
    """Re-download the image of the podcast with id ``podcast_id``.

    After the download the image is verified by generating a 300x300
    thumbnail. If that raises ``IOError`` the image is cleared and the
    podcast saved. Any other failure is recorded via ``PodcastError.create``
    and re-raised.
    """
    podcast = Podcast.objects.get(id=podcast_id)

    def check_thumbnailable():
        # A download can have the right content-type and a non-zero size
        # and still be unusable: turning it into a thumbnail makes PIL
        # throw IOErrors. So a successful thumbnail is the validity check.
        try:
            thumbnail(podcast.image, "300x300")
            print("Worked!")
        except IOError:
            print("Not a valid image if thumbnails can't be made")
            podcast.image = None
            podcast.save()

    try:
        podcast.download_image()
        # If the download worked, it should be possible to make a
        # thumbnail out of it.
        assert podcast.image
        check_thumbnailable()
    except Exception:
        print("Failed!")
        PodcastError.create(podcast)
        raise
Beispiel #6
0
def _scrape_index(url, verbose=False, max_=1000):
    """Scrape a podcast index page and store the shows not yet known by name.

    The page at ``url`` is downloaded and every ``.thumbnails a`` link is
    read as a show: the link target is the show page and the text of its
    ``<h4>`` is the show name. Shows whose name already exists as a
    ``Podcast`` are dropped, the rest are shuffled and at most ``max_`` of
    them are processed: the RSS URL is resolved with ``_scrape_show``, the
    image URL with ``get_image_url``, the podcast is fetched or created and
    its image downloaded.

    Args:
        url: address of the index page.
        verbose: when true, print per-show progress and image failures.
        max_: upper bound on the number of shows processed in this call.

    Returns:
        None. Returns early, doing nothing, when the index page download
        raises one of ``requests_operational_errors``.
    """
    try:
        html = download(url, gently=True)
    except requests_operational_errors:
        return
    doc = pyquery.PyQuery(html)
    shows = []
    for link in doc(".thumbnails a"):
        show_url = urljoin(url, link.attrib["href"])
        # Reset per link: without this a link lacking an <h4> would either
        # hit an unbound ``name`` (first link) or silently reuse the name of
        # the previous link and pair it with the wrong URL.
        name = None
        for h4 in pyquery.PyQuery(link).find("h4"):
            name = h4.text_content()  # the last <h4> wins, as before
        if name is None:
            continue
        shows.append((name, show_url))

    # Materialise once into a set so each membership test below is a hash
    # lookup rather than a scan over the fetched rows.
    existing_names = set(Podcast.objects.all().values_list("name", flat=True))

    # XXX might not keep this
    shows = [(n, u) for (n, u) in shows if n not in existing_names]
    random.shuffle(shows)
    for name, show_url in shows[:max_]:
        rss_url = _scrape_show(show_url)
        if not rss_url:
            print("Skipping", name, show_url)
            continue

        image_url = get_image_url(rss_url)
        if not image_url:
            print("Skipping (no image)", name, rss_url)
            continue
        assert "://" in image_url, image_url

        podcast, created = Podcast.objects.get_or_create(name=name,
                                                         url=rss_url)
        podcast.image_url = image_url
        podcast.save()
        try:
            podcast.download_image()
        except (AssertionError, NotAnImageError):
            # Best effort: a broken image must not stop the scrape, but the
            # failure is still recorded against the podcast.
            if verbose:
                print("Got an error trying to download the image :(")
                print("IGNORING AND MOVING ON")
            PodcastError.create(podcast)

        if verbose:
            if created:
                print("CREATED")
            else:
                print("NOT NEW")
            print(repr(name))
Beispiel #7
0
def _download_episodes(podcast, verbose=True, timeout=10):
    """Download ``podcast``'s feed and create or update one Episode per entry.

    The feed at ``podcast.url`` is fetched with ``download`` and parsed with
    ``feedparser``. Entries for which no duration can be determined are
    skipped. Afterwards ``podcast.last_fetch`` and ``podcast.latest_episode``
    are refreshed and saved.

    Args:
        podcast: the podcast whose feed is processed; must have a name.
        verbose: accepted for interface compatibility; it is not consulted
            here, progress is always printed.
        timeout: passed through to ``download``.

    Raises:
        BadPodcastEntry: an entry has no usable ``published_parsed`` value.
        BadEpisodeDurationError: ``parse_duration_ffmpeg`` reported an error.
        DataError: re-raised (after a ``PodcastError`` is recorded) when
            saving an already existing episode fails.
    """
    # Function-scope import: only needed for the UTC conversion below.
    import calendar

    assert podcast.name, podcast.id
    xml = download(podcast.url, timeout=timeout)
    d = feedparser.parse(xml)

    def get_duration(entry):
        """Return the entry's duration in seconds, or None to skip the entry."""
        if not entry.get("itunes_duration"):
            # No usable itunes:duration; probe the first audio link instead.
            try:
                for link in entry["links"]:
                    if link["type"] == "audio/mpeg" or link["href"].lower(
                    ).endswith(".mp3"):
                        duration, error = parse_duration_ffmpeg(link["href"])
                        if error:
                            raise BadEpisodeDurationError(error)
                        return duration
            except KeyError:
                try:
                    print(entry.enclosure)
                    raise Exception(entry.enclosure)
                except AttributeError:
                    # no 'itunes:duration' and no links
                    print("SKIPPING", entry)
                    return
            # Links exist but none is audio: fall through and return None.
        elif entry["itunes_duration"].count(":") >= 1:
            # Colon-separated form, e.g. "HH:MM:SS" or "MM:SS".
            try:
                raw = entry["itunes_duration"]
                # a bug in bad podcasts
                raw = raw.replace(">", "").replace(";", "")
                parts = [int(float(x)) for x in raw.split(":") if x.strip()]
                if not parts:
                    # A value such as ":" has no numeric component; treat it
                    # like any other unparsable duration rather than letting
                    # the indexing below fail with IndexError.
                    raise ValueError("no numeric components")
            except ValueError:
                print("SKIPPING, BAD itunes_duration")
                print(entry)
                print("itunes_duration=", repr(entry["itunes_duration"]))
                return
            # Right-most component is seconds, then minutes, then hours;
            # anything further to the left is ignored.
            duration = sum(
                part * factor
                for part, factor in zip(reversed(parts), (1, 60, 60 * 60))
            )
            if duration > 24 * 60 * 60:
                # Implausibly long: discard the tag and retry via the links.
                entry["itunes_duration"] = None
                return get_duration(entry)
            return duration
        else:
            # Plain number of seconds (the value is known to be truthy here).
            try:
                return int(float(entry["itunes_duration"]))
            except ValueError:
                print("SKIPPING itunes_duration not a number")
                print(repr(entry["itunes_duration"]))
                return

    for entry in d["entries"]:
        if not entry.get("published_parsed"):
            raise BadPodcastEntry(
                "Entry without a valid 'published_parsed'! ({})".format(
                    podcast.url))

        # feedparser hands back parsed dates as UTC time tuples. Converting
        # with time.mktime() would interpret the tuple as *local* time and
        # shift the result on hosts observing DST, so convert as UTC.
        published = datetime.datetime.fromtimestamp(
            calendar.timegm(entry["published_parsed"]), tz=timezone.utc)

        duration = get_duration(entry)
        if duration is None:
            continue

        # Identify the episode: guid, else id, else a hash of summary/title.
        try:
            guid = entry.guid
        except AttributeError:
            try:
                guid = entry.id
            except AttributeError:
                print("No guid or id. Going to use the summary.")
                try:
                    guid = hashlib.md5(
                        entry.summary.encode("utf-8")).hexdigest()
                except AttributeError:
                    print("No guid or id or summary. ",
                          "Going to use the title.")
                    guid = hashlib.md5(entry.title.encode("utf-8")).hexdigest()

        metadata = dict(entry)
        title = strip_tags(metadata.get("title"))
        summary = strip_tags(metadata.get("summary"))

        # A single lookup decides between update and create (the earlier
        # extra lookup whose result was discarded has been dropped).
        try:
            episode = Episode.objects.get(podcast=podcast, guid=guid)
        except Episode.DoesNotExist:
            episode = Episode.objects.create(
                podcast=podcast,
                duration=duration,
                published=published,
                guid=guid,
                metadata=metadata,
                title=title,
                summary=summary,
            )
            print("CREATED episode")
        else:
            episode.duration = duration
            episode.published = published
            episode.metadata = metadata
            episode.title = title
            episode.summary = summary
            try:
                episode.save()
            except DataError:
                print("FROM", podcast.url)
                print("ENTRY")
                print(entry)
                print("TRIED TO SAVE DURATION", duration)
                PodcastError.create(podcast, notes="Tried to save duration")
                raise
        print((episode.podcast.name, episode.guid, episode.duration,
               episode.published))

    print("SETTING last_fetch ON {!r}".format(podcast))
    latest_episode = Episode.objects.filter(podcast=podcast).aggregate(
        latest=Max("published"))["latest"]
    print("SETTING latest_episode {!r}".format(latest_episode))
    # Reload before saving so the full-row save below starts from the
    # current database state of the podcast.
    podcast.refresh_from_db()
    podcast.last_fetch = timezone.now()
    podcast.latest_episode = latest_episode
    podcast.save()
Beispiel #8
0
def _download_episodes(podcast, verbose=True):
    """Download the feed at ``podcast.url`` and create or update its episodes.

    Python 2 code (print statements). The feed is fetched with ``download``
    and parsed with ``feedparser``; for every entry a duration and a guid are
    determined and the matching ``Episode`` is updated or created. Entries
    without a determinable duration are skipped. Finally the podcast's
    ``last_fetch`` is set to the current time.

    ``verbose`` is accepted but not consulted in this function; progress is
    always printed.

    Raises ``BadPodcastEntry`` when an entry has no ``published_parsed``
    value, and re-raises ``DataError`` when saving an existing episode fails.
    """
    xml = download(podcast.url)
    d = feedparser.parse(xml)

    def get_duration(entry):
        """Return the duration for ``entry`` or None when it must be skipped.

        Values parsed from ``itunes_duration`` are whole seconds; without
        that tag the result of ``parse_duration_ffmpeg`` is returned as is.
        """
        if not entry.get('itunes_duration'):
            # No usable itunes:duration; probe the first audio link instead.
            try:
                for link in entry['links']:
                    if (
                        link['type'] == 'audio/mpeg' or
                        link['href'].lower().endswith('.mp3')
                    ):
                        return parse_duration_ffmpeg(
                            link['href']
                        )
            except KeyError:
                try:
                    print entry.enclosure
                    raise Exception(entry.enclosure)
                except AttributeError:
                    # no 'itunes:duration' and no links
                    print "SKIPPING", entry
                    return
            # When links exist but none is audio, control falls off the end
            # of the function and None is returned implicitly.
        elif entry['itunes_duration'].count(':') >= 1:
            # Colon-separated form, e.g. "HH:MM:SS" or "MM:SS".
            try:
                itunes_duration = entry['itunes_duration']
                # a bug in bad podcasts
                itunes_duration = itunes_duration.replace('>', '')
                itunes_duration = itunes_duration.replace(';', '')

                itunes_duration = [
                    int(float(x)) for x in itunes_duration.split(':')
                    if x.strip()
                ]
            except ValueError:
                print "SKIPPING, BAD itunes_duration"
                print entry
                print 'itunes_duration=', repr(entry['itunes_duration'])
                return
            duration = 0
            # Reverse so index 0 is seconds, 1 is minutes, 2 is hours.
            # NOTE(review): a value such as ":" leaves this list empty and
            # the [0] access below raises IndexError.
            itunes_duration.reverse()
            duration += itunes_duration[0]  # seconds
            if len(itunes_duration) > 1:
                duration += 60 * itunes_duration[1]  # minutes
                if len(itunes_duration) > 2:
                    duration += 60 * 60 * itunes_duration[2]  # hours
            if duration > 24 * 60 * 60:
                # Longer than a day: discard the tag and retry, which takes
                # the link-probing branch above.
                entry['itunes_duration'] = None
                return get_duration(entry)
            return duration
        else:
            # NOTE(review): the value is truthy whenever this branch is
            # reached (the first ``if`` handles falsy values), so this check
            # looks unreachable.
            if not entry['itunes_duration']:
                print "BUT!", xml.find('<itunes:duration')
                return
            try:
                return int(float(entry['itunes_duration']))
            except ValueError:
                # pprint(entry)
                print "SKIPPING itunes_duration not a number"
                print repr(entry['itunes_duration'])
                return

    for entry in d['entries']:
        if not entry.get('published_parsed'):
            # print "Entry without a valid 'published_parsed'!"
            # print entry
            raise BadPodcastEntry(
                "Entry without a valid 'published_parsed'! ({})".format(
                    podcast.url
                )
            )

        # NOTE(review): time.mktime() interprets the tuple as local time;
        # confirm that matches how the feed dates are produced.
        published = datetime.datetime.fromtimestamp(
            time.mktime(entry['published_parsed'])
        )
        # fromtimestamp() without a tz argument yields a naive datetime, so
        # the value is tagged as UTC here.
        if published.tzinfo is None:
            published = published.replace(tzinfo=timezone.utc)
        duration = get_duration(entry)
        if duration is None:
            continue
        # Identify the episode: guid, else id, else an MD5 of the summary,
        # else an MD5 of the title.
        try:
            guid = entry.guid
        except AttributeError:
            try:
                guid = entry.id
            except AttributeError:
                print "No guid or id. Going to use the summary."
                try:
                    guid = hashlib.md5(
                        entry.summary.encode('utf-8')
                    ).hexdigest()
                except AttributeError:
                    # The trailing comma keeps both prints on one line.
                    print "No guid or id or summary. ",
                    print "Going to use the title."
                    guid = hashlib.md5(
                        entry.title.encode('utf-8')
                    ).hexdigest()
                # raise
        # Diagnostic only: report whether the stored values differ from the
        # feed. The same episode is looked up again right below.
        try:
            ep = Episode.objects.get(
                podcast=podcast,
                guid=guid
            )
            if ep.duration != duration:
                print "DURATION CHANGED!!!"
            else:
                print "Duration unchanged"
            if ep.published != published:
                print "PUBLISHED CHANGED!!!"
            else:
                print "Published unchanged"
        except Episode.DoesNotExist:
            pass

        # Update the existing episode, or create it when there is none.
        try:
            episode = Episode.objects.get(
                podcast=podcast,
                guid=guid
            )
            episode.duration = duration
            episode.published = published
            try:
                episode.save()
                print "SAVED",
            except DataError:
                # Log the offending entry, record the failure against the
                # podcast and let the error propagate.
                print "FROM", podcast.url
                print "ENTRY"
                print entry
                print "TRIED TO SAVE DURATION", duration
                PodcastError.create(podcast, notes='Tried to save duration')
                raise
        except Episode.DoesNotExist:
            episode = Episode.objects.create(
                podcast=podcast,
                duration=duration,
                published=published,
                guid=guid,
            )
            print "CREATED",
        # As a Python 2 print statement this prints the tuple's repr,
        # continuing the "SAVED"/"CREATED" line started above.
        print (
            episode.podcast.name,
            episode.guid,
            episode.duration,
            episode.published
        )
    print("SETTING last_fetch ON {!r}".format(podcast))
    # Queryset update: only the last_fetch column is written.
    Podcast.objects.filter(id=podcast.id).update(last_fetch=timezone.now())
Beispiel #9
0
def _scrape_index(url, verbose=False, max_=1000):
    """Scrape a podcast index page and store the shows not yet known by name.

    Python 2 code (print statements). Every ``.thumbnails a`` link on the
    page at ``url`` is read as a show (link target = show page, ``<h4>``
    text = name). Shows whose name already exists as a ``Podcast`` are
    dropped, the rest are shuffled and at most ``max_`` of them are
    processed: the RSS URL comes from ``_scrape_show``, the image URL from
    ``get_image_url``; the podcast is updated or created and its image
    downloaded. ``verbose`` enables the per-show progress output.
    """
    html = download(url, gently=True)
    doc = pyquery.PyQuery(html)
    links = doc('.thumbnails a')
    shows = []
    for link in links:
        show_url = link.attrib['href']
        # Resolve relative links against the index page URL.
        show_url = urljoin(url, show_url)
        link = pyquery.PyQuery(link)
        # NOTE(review): ``name`` is only assigned inside this loop. A link
        # without an <h4> reuses the previous link's name, or fails with an
        # unbound local on the very first link.
        for h4 in link.find('h4'):
            name = h4.text_content()
        shows.append((name, show_url))

    existing_names = Podcast.objects.all().values_list('name', flat=True)

    # XXX might not keep this
    shows = [
        (n, u) for (n, u) in shows
        if n not in existing_names
    ]
    # Random order, so a capped run (max_) does not always take the same
    # leading shows.
    random.shuffle(shows)
    for name, show_url in shows[:max_]:
        rss_url = _scrape_show(show_url)
        if not rss_url:
            print "Skipping", name, show_url
            continue

        image_url = get_image_url(rss_url)
        if not image_url:
            print "Skipping (no image)", name, rss_url
            continue
        assert '://' in image_url, image_url
        # print "IMAGE_URL", image_url

        # Update the podcast of that name, or create it when there is none.
        try:
            podcast = Podcast.objects.get(name=name)
            podcast.url = rss_url
            podcast.image_url = image_url
            podcast.save()
            created = False
        except Podcast.DoesNotExist:
            podcast = Podcast.objects.create(
                name=name,
                url=rss_url,
                image_url=image_url,
            )
            created = True
        try:
            podcast.download_image()
        except (AssertionError, NotAnImageError):
            # Best effort: an image failure is recorded but does not stop
            # the scrape.
            if verbose:
                print "Got an error trying to download the image :("
                print "IGNORING AND MOVING ON"
            PodcastError.create(podcast)

        if verbose:
            # The trailing commas keep the status and the name on one line.
            if created:
                print "CREATED",
            else:
                print "NOT NEW",
            print repr(name)
Beispiel #10
0
def _scrape_index(url, verbose=False, max_=1000):
    """Scrape a podcast index page and store the shows not yet known by name.

    The page at ``url`` is downloaded and every ``.thumbnails a`` link is
    read as a show: the link target is the show page and the text of its
    ``<h4>`` is the show name. Shows whose name already exists as a
    ``Podcast`` are dropped, the rest are shuffled and at most ``max_`` of
    them are processed: the RSS URL is resolved with ``_scrape_show``, the
    image URL with ``get_image_url``, the podcast is fetched or created and
    its image downloaded.

    Args:
        url: address of the index page.
        verbose: when true, print per-show progress and image failures.
        max_: upper bound on the number of shows processed in this call.

    Returns:
        None. Returns early, doing nothing, when the index page download
        raises one of ``requests_operational_errors``.
    """
    try:
        html = download(url, gently=True)
    except requests_operational_errors:
        return
    doc = pyquery.PyQuery(html)
    shows = []
    for link in doc(".thumbnails a"):
        show_url = urljoin(url, link.attrib["href"])
        # Reset per link: without this a link lacking an <h4> would either
        # hit an unbound ``name`` (first link) or silently reuse the name of
        # the previous link and pair it with the wrong URL.
        name = None
        for h4 in pyquery.PyQuery(link).find("h4"):
            name = h4.text_content()  # the last <h4> wins, as before
        if name is None:
            continue
        shows.append((name, show_url))

    # Materialise once into a set so each membership test below is a hash
    # lookup rather than a scan over the fetched rows.
    existing_names = set(Podcast.objects.all().values_list("name", flat=True))

    # XXX might not keep this
    shows = [(n, u) for (n, u) in shows if n not in existing_names]
    random.shuffle(shows)
    for name, show_url in shows[:max_]:
        rss_url = _scrape_show(show_url)
        if not rss_url:
            print("Skipping", name, show_url)
            continue

        image_url = get_image_url(rss_url)
        if not image_url:
            print("Skipping (no image)", name, rss_url)
            continue
        assert "://" in image_url, image_url

        podcast, created = Podcast.objects.get_or_create(name=name, url=rss_url)
        podcast.image_url = image_url
        podcast.save()
        try:
            podcast.download_image()
        except (AssertionError, NotAnImageError):
            # Best effort: a broken image must not stop the scrape, but the
            # failure is still recorded against the podcast.
            if verbose:
                print("Got an error trying to download the image :(")
                print("IGNORING AND MOVING ON")
            PodcastError.create(podcast)

        if verbose:
            if created:
                print("CREATED")
            else:
                print("NOT NEW")
            print(repr(name))
Beispiel #11
0
def _download_episodes(podcast, verbose=True, timeout=10):
    """Download ``podcast``'s feed and create or update one Episode per entry.

    The feed at ``podcast.url`` is fetched with ``download`` and parsed with
    ``feedparser``. Entries for which no duration can be determined are
    skipped. Afterwards ``podcast.last_fetch`` and ``podcast.latest_episode``
    are refreshed and saved.

    Args:
        podcast: the podcast whose feed is processed; must have a name.
        verbose: accepted for interface compatibility; it is not consulted
            here, progress is always printed.
        timeout: passed through to ``download``.

    Raises:
        BadPodcastEntry: an entry has no usable ``published_parsed`` value.
        BadEpisodeDurationError: ``parse_duration_ffmpeg`` reported an error.
        DataError: re-raised (after a ``PodcastError`` is recorded) when
            saving an already existing episode fails.
    """
    # Function-scope import: only needed for the UTC conversion below.
    import calendar

    assert podcast.name, podcast.id
    xml = download(podcast.url, timeout=timeout)
    d = feedparser.parse(xml)

    def get_duration(entry):
        """Return the entry's duration in seconds, or None to skip the entry."""
        if not entry.get("itunes_duration"):
            # No usable itunes:duration; probe the first audio link instead.
            try:
                for link in entry["links"]:
                    if link["type"] == "audio/mpeg" or link["href"].lower().endswith(
                        ".mp3"
                    ):
                        duration, error = parse_duration_ffmpeg(link["href"])
                        if error:
                            raise BadEpisodeDurationError(error)
                        return duration
            except KeyError:
                try:
                    print(entry.enclosure)
                    raise Exception(entry.enclosure)
                except AttributeError:
                    # no 'itunes:duration' and no links
                    print("SKIPPING", entry)
                    return
            # Links exist but none is audio: fall through and return None.
        elif entry["itunes_duration"].count(":") >= 1:
            # Colon-separated form, e.g. "HH:MM:SS" or "MM:SS".
            try:
                raw = entry["itunes_duration"]
                # a bug in bad podcasts
                raw = raw.replace(">", "").replace(";", "")
                parts = [int(float(x)) for x in raw.split(":") if x.strip()]
                if not parts:
                    # A value such as ":" has no numeric component; treat it
                    # like any other unparsable duration rather than letting
                    # the indexing below fail with IndexError.
                    raise ValueError("no numeric components")
            except ValueError:
                print("SKIPPING, BAD itunes_duration")
                print(entry)
                print("itunes_duration=", repr(entry["itunes_duration"]))
                return
            # Right-most component is seconds, then minutes, then hours;
            # anything further to the left is ignored.
            duration = sum(
                part * factor
                for part, factor in zip(reversed(parts), (1, 60, 60 * 60))
            )
            if duration > 24 * 60 * 60:
                # Implausibly long: discard the tag and retry via the links.
                entry["itunes_duration"] = None
                return get_duration(entry)
            return duration
        else:
            # Plain number of seconds (the value is known to be truthy here).
            try:
                return int(float(entry["itunes_duration"]))
            except ValueError:
                print("SKIPPING itunes_duration not a number")
                print(repr(entry["itunes_duration"]))
                return

    for entry in d["entries"]:
        if not entry.get("published_parsed"):
            raise BadPodcastEntry(
                "Entry without a valid 'published_parsed'! ({})".format(podcast.url)
            )

        # feedparser hands back parsed dates as UTC time tuples. Converting
        # with time.mktime() would interpret the tuple as *local* time and
        # shift the result on hosts observing DST, so convert as UTC.
        published = datetime.datetime.fromtimestamp(
            calendar.timegm(entry["published_parsed"]), tz=timezone.utc
        )

        duration = get_duration(entry)
        if duration is None:
            continue

        # Identify the episode: guid, else id, else a hash of summary/title.
        try:
            guid = entry.guid
        except AttributeError:
            try:
                guid = entry.id
            except AttributeError:
                print("No guid or id. Going to use the summary.")
                try:
                    guid = hashlib.md5(entry.summary.encode("utf-8")).hexdigest()
                except AttributeError:
                    print("No guid or id or summary. ", "Going to use the title.")
                    guid = hashlib.md5(entry.title.encode("utf-8")).hexdigest()

        metadata = dict(entry)
        title = strip_tags(metadata.get("title"))
        summary = strip_tags(metadata.get("summary"))

        # A single lookup decides between update and create (the earlier
        # extra lookup whose result was discarded has been dropped).
        try:
            episode = Episode.objects.get(podcast=podcast, guid=guid)
        except Episode.DoesNotExist:
            episode = Episode.objects.create(
                podcast=podcast,
                duration=duration,
                published=published,
                guid=guid,
                metadata=metadata,
                title=title,
                summary=summary,
            )
            print("CREATED episode")
        else:
            episode.duration = duration
            episode.published = published
            episode.metadata = metadata
            episode.title = title
            episode.summary = summary
            try:
                episode.save()
            except DataError:
                print("FROM", podcast.url)
                print("ENTRY")
                print(entry)
                print("TRIED TO SAVE DURATION", duration)
                PodcastError.create(podcast, notes="Tried to save duration")
                raise
        print((episode.podcast.name, episode.guid, episode.duration, episode.published))

    print("SETTING last_fetch ON {!r}".format(podcast))
    latest_episode = Episode.objects.filter(podcast=podcast).aggregate(
        latest=Max("published")
    )["latest"]
    print("SETTING latest_episode {!r}".format(latest_episode))
    # Reload before saving so the full-row save below starts from the
    # current database state of the podcast.
    podcast.refresh_from_db()
    podcast.last_fetch = timezone.now()
    podcast.latest_episode = latest_episode
    podcast.save()