Example #1
    def copyvio_compare(self, url, min_confidence=0.75, max_time=30):
        """Check the page like :py:meth:`copyvio_check` against a specific URL.

        This is essentially a reduced version of :meth:`copyvio_check` - a
        copyvio comparison is made using Markov chains and the result is
        returned in a :class:`.CopyvioCheckResult` object - but without using a
        search engine, since the suspected "violated" URL is supplied from the
        start.

        Its primary use is to generate a result when the URL is retrieved from
        a cache, like the one used in EarwigBot's Tool Labs site. After a
        search is done, the resulting URL is stored in a cache for 72 hours so
        future checks against that page will not require another set of
        time- and money-consuming search engine queries. However, the comparison
        itself (which includes the article's and the source's content) cannot
        be stored for data retention reasons, so a fresh comparison is made
        using this function.

        Since no searching is done, neither :exc:`.UnknownSearchEngineError`
        nor :exc:`.SearchQueryError` will be raised.
        """
        log = u"Starting copyvio compare for [[{0}]] against {1}"
        self._logger.info(log.format(self.title, url))
        # Strip the page's wikitext down to plain prose and model it as a
        # Markov chain:
        article = MarkovChain(ArticleTextParser(self.get()).strip())
        # max_time is passed twice: as the overall deadline and, positionally,
        # as the workspace's per-URL timeout; one worker handles the single URL:
        workspace = CopyvioWorkspace(
            article, min_confidence, max_time, self._logger, self._addheaders,
            max_time, num_workers=1)
        workspace.enqueue([url])
        workspace.wait()
        result = workspace.get_result()
        self._logger.info(result.get_log_message(self.title))
        return result
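
A minimal usage sketch for this method, assuming a configured
earwigbot.bot.Bot; the config path, page title, and URL are hypothetical
placeholders, and the result attributes follow CopyvioCheckResult:

from earwigbot import bot

my_bot = bot.Bot(".earwigbot")  # hypothetical path to the bot's config directory
page = my_bot.wiki.get_site().get_page("Example article")  # hypothetical title

result = page.copyvio_compare("http://example.com/mirror", max_time=30)
if result.violation:
    print(u"Suspected violation with {0:.0%} confidence".format(result.confidence))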
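
For intuition, a rough sketch of the n-gram overlap that a Markov-chain
comparison of two texts can measure. This is not EarwigBot's actual
MarkovChain implementation; the helpers and the scoring formula below are
simplified assumptions:

def build_chain(text, degree=2):
    # Toy word-level chain: map each n-gram of words to its occurrence count.
    words = text.lower().split()
    counts = {}
    for i in range(len(words) - degree + 1):
        gram = tuple(words[i:i + degree])
        counts[gram] = counts.get(gram, 0) + 1
    return counts

def overlap_confidence(article_text, source_text, degree=2):
    # Fraction of the article's n-grams that also occur in the source.
    article = build_chain(article_text, degree)
    source = build_chain(source_text, degree)
    shared = sum(min(count, source[gram])
                 for gram, count in article.items() if gram in source)
    total = sum(article.values())
    return float(shared) / total if total else 0.0

print(overlap_confidence("the cat sat on the mat",
                         "a cat sat on a mat"))  # 0.4: 2 of 5 bigrams shared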
Example #2
    def copyvio_check(self, min_confidence=0.75, max_queries=15, max_time=-1,
                      no_searches=False, no_links=False, short_circuit=True):
        """Check the page for copyright violations.

        Returns a :class:`.CopyvioCheckResult` object with information on the
        results of the check.

        *min_confidence* is the minimum amount of confidence we must have in
        the similarity between a source text and the article in order for us to
        consider it a suspected violation. This is a number between 0 and 1.

        *max_queries* limits the number of search engine queries: we will
        never make more than this many in a given check.

        *max_time* can be set to prevent copyvio checks from taking longer than
        a set amount of time (generally around a minute), which can be useful
        if checks are called through a web server with timeouts. We will stop
        checking new URLs as soon as this limit is reached.

        Setting *no_searches* to ``True`` will cause only URLs in the wikitext
        of the page to be checked; no search engine queries will be made.
        Setting *no_links* to ``True`` will cause the opposite to happen: URLs
        in the wikitext will be ignored; search engine queries will be made
        only. Setting both of these to ``True`` is pointless.

        Normally, the checker will short-circuit if it finds a URL that meets
        *min_confidence*, skipping any remaining URLs and web queries; setting
        *short_circuit* to ``False`` will prevent this.

        Raises :exc:`.CopyvioCheckError` or subclasses
        (:exc:`.UnknownSearchEngineError`, :exc:`.SearchQueryError`, ...) on
        errors.
        """
        log = u"Starting copyvio check for [[{0}]]"
        self._logger.info(log.format(self.title))
        searcher = self._get_search_engine()
        parser = ArticleTextParser(self.get())
        article = MarkovChain(parser.strip())
        parser_args = {}

        if self._exclusions_db:
            # Sync the per-site exclusions list, build a predicate that skips
            # excluded URLs, and collect hints for detecting mirror sites:
            self._exclusions_db.sync(self.site.name)
            exclude = lambda u: self._exclusions_db.check(self.site.name, u)
            parser_args["mirror_hints"] = self._exclusions_db.get_mirror_hints(
                self.site.name)
        else:
            exclude = None

        workspace = CopyvioWorkspace(
            article, min_confidence, max_time, self._logger, self._addheaders,
            short_circuit=short_circuit, parser_args=parser_args)

        if article.size < 20:  # Auto-fail very small articles
            result = workspace.get_result()
            self._logger.info(result.get_log_message(self.title))
            return result

        if not no_links:
            workspace.enqueue(parser.get_links(), exclude)
        num_queries = 0
        if not no_searches:
            chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
            for chunk in chunks:
                if short_circuit and workspace.finished:
                    # The workspace finished early, so the remaining chunks
                    # will go unqueried; note that a source may be missed:
                    workspace.possible_miss = True
                    break
                log = u"[[{0}]] -> querying {1} for {2!r}"
                self._logger.debug(log.format(self.title, searcher.name, chunk))
                workspace.enqueue(searcher.search(chunk), exclude)
                num_queries += 1
                sleep(1)  # brief pause between consecutive search queries

        workspace.wait()
        result = workspace.get_result(num_queries)
        self._logger.info(result.get_log_message(self.title))
        return result
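
A similarly hypothetical usage sketch for the full check, here restricted to
links already present in the wikitext (no search engine queries); the setup
names are placeholders as before:

from earwigbot import bot

my_bot = bot.Bot(".earwigbot")  # hypothetical path to the bot's config directory
page = my_bot.wiki.get_site().get_page("Example article")  # hypothetical title

result = page.copyvio_check(min_confidence=0.75, max_time=60, no_searches=True)
print(result.get_log_message(page.title))  # the same summary the method logs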