Exemple #1
0
 def save(self, *args, **kwargs):
     """Save the source, then mark open scrape failures as resolved.

     The regular model save always runs first. Afterwards, if
     ``last_failure`` is ``None``, every related scrape failure whose
     ``resolved`` field is still null is stamped with the current time
     and saved individually.
     """
     super(Source, self).save(*args, **kwargs)

     # A recorded failure means there is nothing to resolve yet.
     if self.last_failure is not None:
         return

     unresolved = self.scrape_failures.filter(resolved__isnull=True)
     for failure in unresolved:
         failure.resolved = now()
         failure.save()
Exemple #2
0
def scrape_release(source, feed, entry, link):
    """Create or refresh the ``Release`` stored for ``link`` from a feed entry.

    Args:
        source: Source the release is attributed to.
        feed: Feed the entry belongs to (not used by this function).
        entry: Mapping-like feed entry. ``title`` and the date keys
            ``published`` / ``updated`` / ``a10:updated`` are read with
            ``get``; a missing title is stored as an empty string and a
            missing date falls back to the current time.
        link: URL of the release; used both to fetch the body and as the
            lookup key for an existing ``Release``.

    Returns:
        None. Nothing is stored when ``get_link_content`` returns ``None``.
    """
    title_text = entry.get('title')
    if title_text is None:
        # An entry without a title must not abort the scrape.
        title_text = u''
    elif not isinstance(title_text, unicode):
        # Byte strings have to be *decoded*. Calling encode() on a Python 2
        # str implicitly decodes it as ASCII first and raises on any
        # non-ASCII byte, regardless of the 'ignore' error handler.
        title_text = title_text.decode('utf-8', 'ignore')
    title = kill_control_characters(title_text)

    date_text = (entry.get('published') or
                 entry.get('updated') or
                 entry.get('a10:updated'))
    date = dateutil.parser.parse(date_text) if date_text else now()

    body = get_link_content(link)
    if body is None:
        return

    # Does not use get_or_create because the unique constraint is just the url
    # and we don't want the source foreign key field to ever be null.
    try:
        release = Release.objects.get(url=link)
    except Release.DoesNotExist:
        Release.objects.create(url=link,
                               source=source,
                               title=title,
                               date=date,
                               body=body)
    else:
        release.title = title
        release.date = date
        release.body = body
        release.source = source
        release.save()
Exemple #3
0
 def is_stale(self, seconds=None):
     """Return True when the source is due to be retrieved again.

     Args:
         seconds: Maximum allowed age of ``last_retrieved`` in seconds.
             ``None`` (the default) means ``settings.SCRAPE_PERIOD``. An
             explicit ``0`` is honoured instead of being replaced by the
             default.

     Returns:
         True if ``last_retrieved`` is ``None`` or lies more than
         ``seconds`` in the past; False otherwise.
     """
     # Compare against None so that a caller-supplied 0 is not discarded.
     if seconds is None:
         seconds = settings.SCRAPE_PERIOD
     if self.last_retrieved is None:
         return True
     age = now() - self.last_retrieved
     return age.total_seconds() > seconds
    def handle(self, *args, **options):
        """Re-fetch the body of already stored releases, identified by URL.

        Every positional argument is treated as a release URL. HTTP(S) URLs
        are looked up as ``Release`` rows, their content is fetched again and
        the row is saved with a cleaned title, the new body and a fresh
        ``updated`` timestamp. Other arguments are skipped with a warning.
        Any error for one URL is logged and does not stop the remaining ones.

        Raises:
            CommandError: if ``SUPERFASTMATCH`` is missing from the settings.
        """
        if not hasattr(settings, "SUPERFASTMATCH"):
            raise CommandError("You must configure SUPERFASTMATCH in your project settings.")

        self.sfm = from_django_conf()

        for url in args:
            try:
                if not url.startswith(("http://", "https://")):
                    # Report the argument itself: no release has been looked
                    # up on this path, so ``release`` would be unbound (or
                    # left over from a previous iteration).
                    logging.warning("Skipping non-HTTP link {0}".format(url))
                    continue

                release = Release.objects.get(url=url)
                # NOTE(review): the body is stored as returned, even if the
                # fetch yields None -- confirm that is intended.
                body = get_link_content(release.url)
                release.title = kill_control_characters(release.title)
                release.body = body
                release.updated = now()
                release.save()
                logging.info("Updated release {0}: {1}".format(release.id, release.url))
            except Exception as e:
                # Per-URL boundary: log and carry on with the next argument.
                logging.error("Failed to rescrape {0}: {1}".format(url, str(e)))
Exemple #5
0
    def handle(self, *args, **options):
        """Scrape releases for sources of type 2 and record any failures.

        With exactly one positional argument the sources are narrowed to the
        one with that feed URL (``http://`` / ``https://`` prefix) or that
        numeric ID. A source is scraped when ``is_stale()`` is true or the
        ``including_stale`` option is set; on success its ``last_retrieved``
        is set to now and ``last_failure`` is cleared. A raised
        ``SourceScrapeFailure`` is saved as is; any other exception is stored
        as a new ``SourceScrapeFailure`` carrying the traceback.

        Raises:
            CommandError: if ``SUPERFASTMATCH`` or ``DEFAULT_DOCTYPE`` is
                missing from the settings, or the single argument is neither
                a URL nor an integer.
        """
        required_settings = (
            ('SUPERFASTMATCH',
             'You must configure SUPERFASTMATCH in your project settings.'),
            ('DEFAULT_DOCTYPE',
             'You must specify a DEFAULT_DOCTYPE in your project settings.'),
        )
        for setting_name, complaint in required_settings:
            if not hasattr(settings, setting_name):
                raise CommandError(complaint)

        self.sfm = from_django_conf()

        candidates = Source.objects.filter(source_type=2)
        if len(args) == 1:
            target = args[0]
            if target.startswith(('http://', 'https://')):
                candidates = candidates.filter(url=target)
            else:
                try:
                    source_id = int(target)
                except ValueError:
                    raise CommandError("Arguments must be source IDs or feed URLs")
                candidates = candidates.filter(id=source_id)

        for source in candidates:
            try:
                if source.is_stale() or options['including_stale']:
                    self.scrape_releases(source)
                    source.last_retrieved = now()
                    source.last_failure = None
                    source.save()

            except SourceScrapeFailure as failure:
                # The exception object is itself the record to persist.
                failure.save()

            except Exception as exc:
                # Capture the traceback text so it can be stored with the failure.
                trace = StringIO()
                print_exc(1000, trace)
                SourceScrapeFailure.objects.create(source=source,
                                                   traceback=trace.getvalue(),
                                                   description=unicode(exc))