Ejemplo n.º 1
0
def test_langdetect():
    """Check `langdetect` on an empty string and on an English sentence."""
    # An empty string yields no detected language.
    empty_guess = langdetect('')
    assert empty_guess is None

    # A plain English sentence is reported as 'en'.
    english_guess = langdetect('Dear sir, please open the door')
    assert english_guess == 'en'
Ejemplo n.º 2
0
    def filter(self):
        r"""Filter this :class:`~.db.Cluster` and its children
        :class:`~.db.Quote`\ s to see if they're worth keeping.

        First, iterate through all the children :class:`~.db.Quote`\ s of the
        cluster, seeing if each one of them is worth keeping. A
        :class:`~.db.Quote` is discarded if it has no urls, has fewer than
        :data:`~.settings.MT_FILTER_MIN_TOKENS` tokens, spans longer than
        :data:`~.settings.MT_FILTER_MAX_DAYS` days, or is not in English. Any
        :class:`~.db.Quote` that has none of those problems will be kept.

        If after this filtering there are no :class:`~.db.Quote`\ s left, or
        the :class:`~.db.Cluster` made of the remaining :class:`~.db.Quote`\ s
        still spans longer than :data:`~.settings.MT_FILTER_MAX_DAYS`, the
        cluster and all its quotes will be discarded and `None` is returned.
        If not, a new :class:`~.db.Cluster` is created with `cluster.filtered =
        True` and `cluster.id = original_cluster.id +`
        :func:`filter_cluster_offset`. That new cluster points to copies of all
        the kept :class:`~.db.Quote`\ s, with `quote.filtered = True` and
        `quote.id = original_quote.id +` :func:`filter_quote_offset`. All those
        models (new cluster and new quotes) should later be saved to the
        database (the method does not do it for you), e.g. by running this
        method inside a :func:`~.utils.session_scope`.

        Returns
        -------
        cluster : :class:`~.db.Cluster` or None
            The filtered cluster pointing to filtered quotes, or `None` if it
            is to be discarded.

        Raises
        ------
        AlreadyFiltered
            If this cluster is already filtered (i.e.
            :attr:`~.db.Cluster.filtered` is `True`).

        """

        # Filtering a filtered cluster would stack the id offsets twice.
        if self.filtered:
            raise AlreadyFiltered('Cluster is already filtered')

        min_tokens = settings.MT_FILTER_MIN_TOKENS
        max_span = timedelta(days=settings.MT_FILTER_MAX_DAYS)
        fcluster = self.clone(id=filter_cluster_offset() + self.id,
                              filtered=True)

        # Examine each quote for urls, min_tokens, max_days, and language.
        # The checks are ordered so that language detection only runs on
        # quotes that passed all the other tests.
        for quote in self.quotes:

            if quote.frequency == 0:
                logger.debug('Dropping quote #%s (cluster #%s): '
                             'no urls', quote.sid, self.sid)
                continue

            if len(quote.tokens) < min_tokens:
                logger.debug('Dropping quote #%s (cluster #%s): '
                             'not enough tokens', quote.sid, self.sid)
                continue

            if quote.span > max_span:
                logger.debug('Dropping quote #%s (cluster #%s): '
                             'span too big', quote.sid, self.sid)
                continue

            if langdetect(quote.string) != 'en':
                logger.debug('Dropping quote #%s (cluster #%s): '
                             'not English', quote.sid, self.sid)
                continue

            logger.debug('Keeping quote #%s (cluster #%s)',
                         quote.sid, self.sid)
            # The copy is attached to the filtered cluster, not the original.
            fquote = quote.clone(id=filter_quote_offset() + quote.id,
                                 cluster_id=fcluster.id, filtered=True)
            fcluster.quotes.append(fquote)

        # If no quotes were kept, drop the whole cluster.
        if fcluster.size == 0:
            logger.debug('Dropping cluster #%s: no quotes left', self.sid)
            return None

        # Finally, if the new cluster spans too many days, discard it: quotes
        # that individually fit in max_span can still be spread over a longer
        # period when taken together.
        if fcluster.span > max_span:
            logger.debug('Dropping cluster #%s: span too big', self.sid)
            return None

        logger.debug('Keeping cluster #%s after filtering', self.sid)
        return fcluster