Exemple #1
0
def _get_cached_results(page, conn, mode, noskip):
    """Return a previously cached check result for *page*, or ``None``.

    The cache row is keyed by the SHA-256 digest of *mode* concatenated with
    the UTF-8 encoded page text and is looked up through *conn*. ``None`` is
    returned (so the caller can run a fresh check) when:

    - no row exists for that key;
    - the row is flagged as a possible miss and *noskip* is true;
    - the row is more than three days old;
    - the first stored source is marked as skipped, or re-comparing the page
      against it yields a confidence differing from the stored value by
      0.0001 or more;
    - *noskip* is true and any of the remaining stored sources is marked as
      skipped.

    Otherwise the result object is returned with ``cached``, ``cache_time``
    and ``cache_age`` set on it.
    """
    q_meta = """SELECT cache_time, cache_queries, cache_process_time,
                       cache_possible_miss
                FROM cache
                WHERE cache_id = ?"""
    q_data = """SELECT cdata_url, cdata_confidence, cdata_skipped, cdata_excluded
                FROM cache_data
                WHERE cdata_cache_id = ?"""

    digest = sha256(mode + page.get().encode("utf8")).digest()
    cache_id = buffer(digest)

    cursor = conn.cursor()
    cursor.execute(q_meta, (cache_id,))
    meta_rows = cursor.fetchall()
    if not meta_rows:
        return None

    cache_time, queries, check_time, possible_miss = meta_rows[0]
    if noskip and possible_miss:
        return None
    # The stored time may come back as a raw timestamp rather than a datetime.
    if not isinstance(cache_time, datetime):
        cache_time = datetime.utcfromtimestamp(cache_time)
    age = datetime.utcnow() - cache_time
    if age > timedelta(days=3):
        return None

    def stamp(res):
        # Cache bookkeeping shared by both successful return paths.
        res.cached = True
        res.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
        res.cache_age = _format_date(cache_time)
        return res

    cursor.execute(q_data, (cache_id,))
    data = cursor.fetchall()

    if not data:  # TODO: do something less hacky for this edge case
        article_chain = MarkovChain(ArticleTextParser(page.get()).strip())
        empty = CopyvioCheckResult(False, [], queries, check_time,
                                   article_chain, possible_miss)
        return stamp(empty)

    remaining = iter(data)
    url, confidence, skipped, excluded = next(remaining)
    if skipped:  # Should be impossible: data must be bad; run a new check
        return None
    # Re-run the comparison against the first stored source and make sure it
    # still agrees with the stored confidence before trusting the cache.
    result = page.copyvio_compare(url, min_confidence=T_SUSPECT, max_time=30)
    if abs(result.confidence - confidence) >= 0.0001:
        return None

    # Rebuild the other sources from their stored values.
    for url, confidence, skipped, excluded in remaining:
        if noskip and skipped:
            return None
        source = CopyvioSource(None, url)
        source.confidence = confidence
        source.skipped = bool(skipped)
        source.excluded = bool(excluded)
        result.sources.append(source)

    result.queries = queries
    result.time = check_time
    result.possible_miss = possible_miss
    return stamp(result)
Exemple #2
0
    def enqueue(self, urls, exclude_check=None):
        """Put a list of URLs into the various worker queues.

        Each URL is processed while holding ``self._queues.lock``. URLs that
        are already in ``self._handled_urls`` are ignored. For every new URL
        a source object is created and appended to ``self.sources``, then:

        - if *exclude_check* is given and returns a true value for the URL,
          the source is marked as excluded and skipped;
        - otherwise, if ``self._short_circuit`` and ``self.finished`` are
          both true, the source is skipped;
        - otherwise the source is appended to the queue for its site key.
          A queue is created for a key seen for the first time and is also
          put on ``self._queues.unassigned`` as a ``(key, queue)`` pair.

        *exclude_check* is an optional exclusion function that takes a URL and
        returns ``True`` if we should skip it and ``False`` otherwise.
        """
        site_msg = u"enqueue(): {0} {1} -> {2}"

        for url in urls:
            with self._queues.lock:
                if url in self._handled_urls:
                    continue
                self._handled_urls.add(url)

                source = CopyvioSource(url=url, **self._source_args)
                self.sources.append(source)

                if exclude_check and exclude_check(url):
                    self._logger.debug(u"enqueue(): exclude {0}".format(url))
                    source.excluded = True
                    source.skip()
                elif self._short_circuit and self.finished:
                    self._logger.debug(u"enqueue(): auto-skip {0}".format(url))
                    source.skip()
                else:
                    # Sources are grouped by registered domain.
                    try:
                        site = tldextract.extract(url).registered_domain
                    except ImportError:  # Fall back on very naive method
                        # NOTE(review): presumably tldextract is imported
                        # lazily, so the ImportError surfaces here - confirm.
                        from urlparse import urlparse
                        host_parts = urlparse(url).netloc.split(".")
                        site = u".".join(host_parts[-2:])

                    known = site in self._queues.sites
                    action = "append" if known else "new"
                    self._logger.debug(site_msg.format(action, site, url))
                    if known:
                        self._queues.sites[site].append(source)
                    else:
                        # Register the new queue before filling it, then
                        # publish it on the unassigned queue.
                        queue = deque()
                        self._queues.sites[site] = queue
                        queue.append(source)
                        self._queues.unassigned.put((site, queue))