Example #1
0
def filter_clusters(limit=None):
    """Filter the whole MemeTracker dataset by copying all valid
    :class:`~.db.Cluster`\ s and :class:`~.db.Quote`\ s and setting their
    `filtered` attributes to `True`.

    Iterate through all the MemeTracker :class:`~.db.Cluster`\ s, and filter
    each of them to see if it's worth keeping. If a :class:`~.db.Cluster` is to
    be kept, the function creates a copy of it and all of its kept
    :class:`~.db.Quote`\ s, marking them as filtered. Progress of this
    operation is printed to stdout.

    Once the operation finishes, a VACUUM and an ANALYZE operation are run on
    the database so that it recomputes its optimisations.

    Parameters
    ----------
    limit : int, optional
        If not `None`, stop filtering after `limit` clusters have been seen
        (useful for testing purposes).

    Raises
    ------
    AlreadyFiltered
        If there are already some filtered :class:`~.db.Cluster`\ s or
        :class:`~.db.Quote`\ s stored in the database (indicating another
        filtering operation has already been completed, or started and
        aborted).

    """

    # Imported here (not at module level), presumably to avoid an import
    # cycle with brainscopypaste.db — keep it local.
    from brainscopypaste.db import Session, Cluster, save_by_copy

    logger.info('Filtering memetracker clusters')
    if limit is not None:
        logger.info('Filtering is limited to %s clusters', limit)

    click.echo('Filtering all clusters{}...'
               .format('' if limit is None else ' (limit={})'.format(limit)))

    # Check this isn't already done: any pre-existing filtered cluster means
    # a previous run completed (or was aborted midway), so bail out.
    with session_scope() as session:

        if session.query(Cluster)\
           .filter(Cluster.filtered.is_(True)).count() > 0:
            raise AlreadyFiltered('There are already some filtered '
                                  'clusters, aborting.')

        query = session.query(Cluster.id)
        if limit is not None:
            query = query.limit(limit)
        # Collect plain ids (not ORM objects) so each cluster can be loaded
        # in its own later session. Avoid shadowing the builtin `id`.
        cluster_ids = [cluster_id for (cluster_id,) in query]

    logger.info('Got %s clusters to filter', len(cluster_ids))

    # Filter. Accumulate kept (filtered) copies here for a single bulk save.
    objects = {'clusters': [], 'quotes': []}

    for cluster_id in ProgressBar()(cluster_ids):
        # One short-lived session per cluster keeps the transaction small.
        with session_scope() as session:

            cluster = session.query(Cluster).get(cluster_id)
            # cluster.filter() returns a filtered copy, or None if the
            # cluster is to be dropped.
            fcluster = cluster.filter()

            if fcluster is not None:
                logger.debug('Cluster #%s is kept with %s quotes',
                             cluster.sid, fcluster.size)
                objects['clusters'].append(fcluster)
                objects['quotes'].extend(fcluster.quotes)
            else:
                logger.debug('Cluster #%s is dropped', cluster.sid)

    click.secho('OK', fg='green', bold=True)
    logger.info('Kept %s clusters and %s quotes after filtering',
                len(objects['clusters']), len(objects['quotes']))

    # Save everything in one bulk COPY operation.
    logger.info('Saving filtered clusters to database')
    save_by_copy(**objects)

    # Vacuum analyze so the database recomputes its optimisations.
    logger.info('Vacuuming and analyzing database')
    click.echo('Vacuuming and analyzing... ', nl=False)
    execute_raw(Session.kw['bind'], 'VACUUM ANALYZE')
    click.secho('OK', fg='green', bold=True)
Example #2
0
    def parse(self):
        """Parse the whole MemeTracker file, save, optimise the database, and
        check for consistency.

        Parse the MemeTracker file with :meth:`_parse` to create
        :class:`~.db.Cluster` and :class:`~.db.Quote` database entries
        corresponding to the dataset. The parsed data is then persisted to
        database in one step (with :func:`~.db.save_by_copy`). The database is
        then VACUUMed and ANALYZEd (with :func:`~.utils.execute_raw`) to force
        it to recompute its optimisations. Finally, the consistency of the
        database is checked (with :meth:`_check`) against number of quotes and
        frequency in each cluster of the original file, and against number of
        urls and frequency in each quote of the original file. Progress is
        printed to stdout.

        Note that if `self.limit` is not `None`, parsing will stop after
        `self.limit` clusters have been read.

        Once the parsing is finished, `self.parsed` is set to `True`.

        Raises
        ------
        ValueError
            If this instance has already run a parsing.

        """

        # Guard first: fail fast before logging/printing any progress
        # messages for a run that cannot proceed.
        if self.parsed:
            raise ValueError('Parser has already run')

        logger.info('Parsing memetracker file')
        if self.limit is not None:
            logger.info('Parsing is limited to %s clusters', self.limit)

        click.echo('Parsing MemeTracker data file into database{}...'
                   .format('' if self.limit is None
                           else ' (limit={})'.format(self.limit)))

        # +100 is some margin for ProgressBar, otherwise it raises an exception
        # at the *end* of parsing (once the internal count exceeds max_value).
        lines_left = self.line_count - self.header_size + 100
        # BUGFIX: mode was 'rb', which raises ValueError when combined with
        # an encoding ("binary mode doesn't take an encoding argument").
        # The encoding argument shows text mode was intended.
        with open(self.filename, 'r', encoding='utf8') as self._file, \
                ProgressBar(max_value=lines_left,
                            redirect_stdout=True) as self._bar:
            self._parse()

        click.secho('OK', fg='green', bold=True)
        logger.info('Parsed %s clusters and %s quotes from memetracker file',
                    len(self._objects['clusters']),
                    len(self._objects['quotes']))

        # Save in one bulk COPY step, then release the parsed objects.
        logger.info('Saving parsed clusters to database')
        save_by_copy(**self._objects)
        self._objects = {'clusters': [], 'quotes': []}

        # Vacuum analyze so the database recomputes its optimisations.
        logger.info('Vacuuming and analyzing database')
        click.echo('Vacuuming and analyzing... ', nl=False)
        execute_raw(Session.kw['bind'], 'VACUUM ANALYZE')
        click.secho('OK', fg='green', bold=True)

        # And check file-vs-database consistency (quote/url counts and
        # frequencies).
        logger.info('Checking consistency of the file against the database')
        click.echo('Checking consistency...')
        self._check()

        # Don't do this twice.
        self.parsed = True
        click.secho('All done.', fg='green', bold=True)