# Stdlib imports needed below; project-internal names (MarkovChain,
# ArticleTextParser, CopyvioCheckResult, CopyvioSource, T_SUSPECT) and the
# third-party tldextract module are assumed to be imported elsewhere in
# this module.
from collections import deque
from datetime import datetime, timedelta
from hashlib import sha256


def _get_cached_results(page, conn, mode, noskip):
    """Return a cached check result for *page*, or None if none is usable.

    Cache entries older than three days are ignored, as are entries flagged
    as possible misses when *noskip* is set. The top cached source is
    re-checked with page.copyvio_compare(); if its confidence has drifted,
    the cache is treated as stale and None is returned so a fresh check runs.
    """
    query1 = """SELECT cache_time, cache_queries, cache_process_time,
                       cache_possible_miss
                FROM cache WHERE cache_id = ?"""
    query2 = """SELECT cdata_url, cdata_confidence, cdata_skipped,
                       cdata_excluded
                FROM cache_data WHERE cdata_cache_id = ?"""
    cache_id = buffer(sha256(mode + page.get().encode("utf8")).digest())

    cursor = conn.cursor()
    cursor.execute(query1, (cache_id,))
    results = cursor.fetchall()
    if not results:
        return None

    cache_time, queries, check_time, possible_miss = results[0]
    if possible_miss and noskip:
        return None
    if not isinstance(cache_time, datetime):
        cache_time = datetime.utcfromtimestamp(cache_time)
    if datetime.utcnow() - cache_time > timedelta(days=3):
        return None

    cursor.execute(query2, (cache_id,))
    data = cursor.fetchall()
    if not data:  # TODO: do something less hacky for this edge case
        article_chain = MarkovChain(ArticleTextParser(page.get()).strip())
        result = CopyvioCheckResult(False, [], queries, check_time,
                                    article_chain, possible_miss)
        result.cached = True
        result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
        result.cache_age = _format_date(cache_time)
        return result

    url, confidence, skipped, excluded = data.pop(0)
    if skipped:  # Should be impossible: data must be bad; run a new check
        return None
    result = page.copyvio_compare(url, min_confidence=T_SUSPECT, max_time=30)
    if abs(result.confidence - confidence) >= 0.0001:
        return None

    for url, confidence, skipped, excluded in data:
        if noskip and skipped:
            return None
        source = CopyvioSource(None, url)
        source.confidence = confidence
        source.skipped = bool(skipped)
        source.excluded = bool(excluded)
        result.sources.append(source)

    result.queries = queries
    result.time = check_time
    result.possible_miss = possible_miss
    result.cached = True
    result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
    result.cache_age = _format_date(cache_time)
    return result
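
# A minimal sketch of the _format_date() helper used above, assuming it
# renders a cache timestamp as a human-readable age such as "3 hours".
# The project defines its own version; this one is illustrative only.
def _format_date(cache_time):
    diff = datetime.utcnow() - cache_time
    seconds = diff.days * 86400 + diff.seconds
    for unit, size in (("day", 86400), ("hour", 3600), ("minute", 60)):
        if seconds >= size:
            n = seconds // size
            return "{0} {1}{2}".format(n, unit, "" if n == 1 else "s")
    return "{0} second{1}".format(seconds, "" if seconds == 1 else "s")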
def enqueue(self, urls, exclude_check=None):
    """Put a list of URLs into the various worker queues.

    *exclude_check* is an optional exclusion function that takes a URL and
    returns ``True`` if we should skip it and ``False`` otherwise.
    """
    for url in urls:
        with self._queues.lock:
            if url in self._handled_urls:
                continue
            self._handled_urls.add(url)

            source = CopyvioSource(url=url, **self._source_args)
            self.sources.append(source)

            if exclude_check and exclude_check(url):
                self._logger.debug(u"enqueue(): exclude {0}".format(url))
                source.excluded = True
                source.skip()
                continue
            if self._short_circuit and self.finished:
                self._logger.debug(u"enqueue(): auto-skip {0}".format(url))
                source.skip()
                continue

            try:
                # tldextract is imported lazily elsewhere, so this call can
                # raise ImportError if the library is unavailable
                key = tldextract.extract(url).registered_domain
            except ImportError:  # Fall back on a very naive method
                from urlparse import urlparse
                key = u".".join(urlparse(url).netloc.split(".")[-2:])

            logmsg = u"enqueue(): {0} {1} -> {2}"
            if key in self._queues.sites:
                self._logger.debug(logmsg.format("append", key, url))
                self._queues.sites[key].append(source)
            else:
                self._logger.debug(logmsg.format("new", key, url))
                self._queues.sites[key] = queue = deque()
                queue.append(source)
                self._queues.unassigned.put((key, queue))
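
# enqueue() above assumes self._queues exposes a lock, a sites dict mapping
# each registered domain to a deque of sources, and an unassigned queue of
# (domain, deque) pairs for idle workers to claim. A minimal sketch of such
# a container (an assumption, not necessarily the project's actual class):
from Queue import Queue
from threading import Lock

class _CopyvioQueues(object):
    """Holds the per-domain source queues shared between workers."""

    def __init__(self):
        self.lock = Lock()         # guards sites/unassigned during enqueue()
        self.sites = {}            # registered domain -> deque of sources
        self.unassigned = Queue()  # (domain, deque) pairs awaiting a worker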