Beispiel #1
0
    def get_latest(self, skip_pattern=None):
        """
        Fetch ``self.url`` and record a new version if the content changed.

        The page is downloaded, reduced to its readability summary (only
        ``<p>`` tags are kept by the bleach pass) and normalized. The result
        is compared with the most recently created EntryVersion stored for
        the canonical URL. When there is no previous version, or the title
        or summary differs, a new EntryVersion is created and its
        ``archive()`` method is called. If a previous version exists, a Diff
        between the old and new versions is created and generated.

        Args:
            skip_pattern: optional pattern; when it matches the title or the
                summary (via ``matches``) the page is ignored.

        Returns:
            The newly created EntryVersion, or None when the fetch raised,
            the response status was not 200, the page matched
            ``skip_pattern``, the content has not changed, or the generated
            diff reported no changes.

        Note:
            ``self.checked`` is only updated (and the entry saved) when the
            method gets past the fetch and skip checks; the early-return
            paths leave it untouched.
        """

        # optional pause before fetching, controlled by the "time_sleep"
        # config value
        time_sleep = config.get("time_sleep", 0)
        if time_sleep > 0:
            time.sleep(time_sleep)

        # fetch the current readability-ized content for the page
        logging.info("checking %s", self.url)
        try:
            resp = _get(self.url)
        except Exception as e:
            # best effort: any fetch failure is logged and treated as
            # "no new version"
            logging.error("unable to fetch %s: %s", self.url, e)
            return None

        if resp.status_code != 200:
            logging.warning("Got %s when fetching %s", resp.status_code, self.url)
            return None

        doc = readability.Document(to_utf8(resp.text))
        title = doc.title()
        summary = doc.summary(html_partial=True)
        summary = bleach.clean(summary, tags=["p"], strip=True)
        summary = _normal(summary)

        # if the title or the summary contains the skipping pattern,
        # then return None as this change should not be reported
        if skip_pattern and (matches(skip_pattern, title)
                             or matches(skip_pattern, summary)):
            logging.info(
                "Skipped page. It matches the skip_pattern prop defined for this feed."
            )
            return None

        # use the final response URL in case there was a redirect, and
        # remove utm style marketing parameters
        canonical_url = _remove_utm(resp.url)

        # get the latest version, if we have one
        versions = (EntryVersion.select().where(
            EntryVersion.url == canonical_url).order_by(
                -EntryVersion.created).limit(1))
        old = versions[0] if len(versions) else None

        # compare what we got against the latest version and create a
        # new version if it looks different, or is brand new (no old version)
        new = None

        # use _equal to determine if the summaries are the same
        if not old or old.title != title or not _equal(old.summary, summary):
            new = EntryVersion.create(title=title,
                                      url=canonical_url,
                                      summary=summary,
                                      entry=self)
            new.archive()
            if old:
                logging.debug("found new version %s", old.entry.url)
                diff = Diff.create(old=old, new=new)
                if not diff.generate():
                    logging.warning(
                        "html diff showed no changes between versions #%s and #%s: %s",
                        old.id,
                        new.id,
                        self.url,
                    )
                    # NOTE(review): if EntryVersion is a peewee Model,
                    # ``delete()`` only builds a DELETE query and nothing is
                    # executed here; ``delete_instance()`` may have been
                    # intended. Left as-is because removing the row would
                    # change what the next check compares against - confirm.
                    new.delete()
                    new = None
            else:
                logging.debug("found first version: %s", self.url)
        else:
            logging.debug("content hasn't changed %s", self.url)

        # record when this entry was last checked
        self.checked = datetime.utcnow()
        self.save()

        return new
Beispiel #2
0
 def test_matches_with_multiline(self):
     # the skip pattern is expected to match text that spans several lines
     text = "Hey!\nYou need to subscribe to 10 articles\nto continue"
     self.assertTrue(matches(self.skip_pattern, text))
Beispiel #3
0
 def test_matches_is_accent_insensitive(self):
     # the skip pattern is expected to match despite mixed case and
     # accented characters in the text
     text = "Hey!\nYou need to SubsCribé to 10 ARTiclès\nto continue"
     self.assertTrue(matches(self.skip_pattern, text))
Beispiel #4
0
 def test_matches_does_match(self):
     # the skip pattern is expected to match a plain single-line text
     text = "Hey! You need to subscribe to 10 articles to continue"
     self.assertTrue(matches(self.skip_pattern, text))