def get_latest(self, skip_pattern=None):
    """
    Fetch the page at self.url and record a new version if it changed.

    get_latest is the heart of the application. It gets the current
    version of the page from the web, extracts its title and summary with
    readability, and compares them against the most recent EntryVersion
    stored for the same canonical URL. If there is no previous version, or
    the title or summary differ, a new EntryVersion is created and its
    archive() method is called. When a previous version exists a Diff
    between the two is created and generated as well.

    Args:
        skip_pattern: optional pattern handed to matches(); when the title
            or the summary matches it, the page is skipped and nothing is
            stored.

    Returns:
        The newly created EntryVersion, or None when the page could not be
        fetched, did not answer with HTTP 200, matched skip_pattern, was
        unchanged, or the generated diff reported no changes.

    Note:
        self.checked is only updated (and self saved) when the comparison
        step is reached; the early-return paths leave it untouched.
    """
    # optional politeness delay between requests, taken from the config
    time_sleep = config.get("time_sleep", 0)
    if time_sleep > 0:
        time.sleep(time_sleep)

    # fetch the current readability-ized content for the page
    logging.info("checking %s", self.url)
    try:
        resp = _get(self.url)
    except Exception as e:
        # best effort: a failed fetch is logged and treated as "no new version"
        logging.error("unable to fetch %s: %s", self.url, str(e))
        return None

    if resp.status_code != 200:
        logging.warning("Got %s when fetching %s", resp.status_code, self.url)
        return None

    doc = readability.Document(to_utf8(resp.text))
    title = doc.title()
    summary = doc.summary(html_partial=True)
    # keep only paragraph markup so that comparisons focus on the text
    summary = bleach.clean(summary, tags=["p"], strip=True)
    summary = _normal(summary)

    # if the title or the summary contains the skipping pattern,
    # then return None as we don't want to report this change
    if skip_pattern and (
        matches(skip_pattern, title) or matches(skip_pattern, summary)
    ):
        logging.info(
            "Skipped page. It matches the skip_pattern prop defined for this feed."
        )
        return None

    # in case there was a redirect, and remove utm style marketing
    canonical_url = _remove_utm(resp.url)

    # get the latest version, if we have one
    versions = (
        EntryVersion.select()
        .where(EntryVersion.url == canonical_url)
        .order_by(-EntryVersion.created)
        .limit(1)
    )
    if len(versions) == 0:
        old = None
    else:
        old = versions[0]

    # compare what we got against the latest version and create a
    # new version if it looks different, or is brand new (no old version)
    new = None

    # use _equal to determine if the summaries are the same
    if not old or old.title != title or not _equal(old.summary, summary):
        new = EntryVersion.create(
            title=title, url=canonical_url, summary=summary, entry=self
        )
        new.archive()
        if old:
            logging.debug("found new version %s", old.entry.url)
            diff = Diff.create(old=old, new=new)
            if not diff.generate():
                logging.warning(
                    "html diff showed no changes between versions #%s and #%s: %s",
                    old.id,
                    new.id,
                    self.url,
                )
                # NOTE(review): if EntryVersion is a peewee model, delete()
                # is a classmethod that only builds an unexecuted DELETE
                # query, so this line would not remove the row;
                # delete_instance() may be what was intended. Confirm before
                # changing: the Diff created above still references `new`.
                new.delete()
                new = None
        else:
            logging.debug("found first version: %s", self.url)
    else:
        logging.debug("content hasn't changed %s", self.url)

    self.checked = datetime.utcnow()
    self.save()

    return new
def test_matches_with_multiline(self):
    """matches() reports a match when the text is spread over several lines."""
    # self.skip_pattern is defined outside this method
    multiline_text = "Hey!\nYou need to subscribe to 10 articles\nto continue"
    self.assertTrue(matches(self.skip_pattern, multiline_text))
def test_matches_is_accent_insensitive(self):
    """matches() reports a match for text with mixed case and accented letters."""
    # self.skip_pattern is defined outside this method
    accented_text = "Hey!\nYou need to SubsCribé to 10 ARTiclès\nto continue"
    self.assertTrue(matches(self.skip_pattern, accented_text))
def test_matches_does_match(self):
    """matches() reports a match for a plain single-line sentence."""
    # self.skip_pattern is defined outside this method
    plain_text = "Hey! You need to subscribe to 10 articles to continue"
    self.assertTrue(matches(self.skip_pattern, plain_text))