Example #1
0
    def _handle_quote(self, fields):
        """Handle a list of quote fields to create a new :class:`~.db.Quote`.

        The newly created :class:`~.db.Quote` is appended to
        `self._objects['quotes']`, and corresponding fields are created in
        `self._checks`.

        Parameters
        ----------
        fields : list of str
            List of fields defining the new quote, as returned by
            :meth:`_parse_line`.

        """

        # Named `quote_id` so as not to shadow the builtin `id`.
        quote_id = int(fields[4])
        self._quote = Quote(cluster_id=self._cluster.id, id=quote_id,
                            sid=quote_id, filtered=False, string=fields[3])
        self._objects['quotes'].append(self._quote)

        # Save checks for later on.
        quote_size = int(fields[2])
        quote_frequency = int(fields[1])
        self._checks[self._cluster.id]['quotes'][self._quote.id] = {
            'size': quote_size,
            'frequency': quote_frequency
        }
Example #2
0
def test_cluster_too_long(filterable_cluster):
    # Stretch the cluster's time span beyond the allowed length by adding
    # a quote whose url sits very far (in time) from quote sid=0. The quote
    # itself is otherwise perfectly valid.
    with session_scope() as session:
        cluster = session.query(Cluster).first()
        extra_quote = Quote(
            sid=5, string='a string with enough words and no problems')
        far_timestamp = datetime.utcnow() + timedelta(days=80, hours=1)
        extra_quote.add_url(Url(timestamp=far_timestamp, frequency=2,
                                url_type='M', url='some-url'))
        cluster.quotes.append(extra_quote)

    # The now-too-long cluster must be filtered out entirely.
    with session_scope() as session:
        cluster = session.query(Cluster).first()
        assert cluster.filter() is None
Example #3
0
class MemeTrackerParser(Parser):

    """Parse the MemeTracker dataset into the database.

    After initialisation, the :meth:`parse` method does all the job. Its
    internal work is done by the utility methods :meth:`_parse`,
    :meth:`_parse_cluster_block` and :meth:`_parse_line` (for actual parsing),
    :meth:`_handle_cluster`, :meth:`_handle_quote` and :meth:`_handle_url` (for
    parsed data handling), and :meth:`_check` (for consistency checking).

    Parameters
    ----------
    filename : str
        Path to the MemeTracker dataset file to parse.
    line_count : int
        Number of lines in `filename`, to help in showing a progress bar.
        Should be computed beforehand with e.g. ``wc -l <filename>``, so python
        doesn't need to load the complete file twice.
    limit : int, optional
        If not `None` (default), stops the parsing once `limit` clusters have
        been read. Useful for testing purposes.

    """

    #: Size (in lines) of the header in the MemeTracker file to be parsed.
    header_size = 6

    def __init__(self, filename, line_count, limit=None):
        """Set up parsing and tracking attributes."""

        self.limit = limit
        self.filename = filename
        self.line_count = line_count

        # Flag preventing a second run of parse() on the same instance.
        self.parsed = False

        # Cluster and quote currently being filled while parsing.
        self._cluster = None
        self._quote = None

    def parse(self):
        """Parse the whole MemeTracker file, save, optimise the database, and
        check for consistency.

        Parse the MemeTracker file with :meth:`_parse` to create
        :class:`~.db.Cluster` and :class:`~.db.Quote` database entries
        corresponding to the dataset. The parsed data is then persisted to
        database in one step (with :func:`~.db.save_by_copy`). The database is
        then VACUUMed and ANALYZEd (with :func:`~.utils.execute_raw`) to force
        it to recompute its optimisations. Finally, the consistency of the
        database is checked (with :meth:`_check`) against number of quotes and
        frequency in each cluster of the original file, and against number of
        urls and frequency in each quote of the original file. Progress is
        printed to stdout.

        Note that if `self.limit` is not `None`, parsing will stop after
        `self.limit` clusters have been read.

        Once the parsing is finished, `self.parsed` is set to `True`.

        Raises
        ------
        ValueError
            If this instance has already run a parsing.

        """

        # Fail fast, before printing or logging anything.
        if self.parsed:
            raise ValueError('Parser has already run')

        logger.info('Parsing memetracker file')
        if self.limit is not None:
            logger.info('Parsing is limited to %s clusters', self.limit)

        click.echo('Parsing MemeTracker data file into database{}...'
                   .format('' if self.limit is None
                           else ' (limit={})'.format(self.limit)))

        # +100 is some margin for ProgressBar, otherwise it raises an exception
        # at the *end* of parsing (once the internal count exceeds max_value).
        lines_left = self.line_count - self.header_size + 100
        # Open in text mode: _parse_line() inspects str characters, and
        # binary mode would reject the encoding argument with a ValueError.
        with open(self.filename, encoding='utf8') as self._file, \
                ProgressBar(max_value=lines_left,
                            redirect_stdout=True) as self._bar:
            self._parse()

        click.secho('OK', fg='green', bold=True)
        logger.info('Parsed %s clusters and %s quotes from memetracker file',
                    len(self._objects['clusters']),
                    len(self._objects['quotes']))

        # Save.
        logger.info('Saving parsed clusters to database')
        save_by_copy(**self._objects)
        self._objects = {'clusters': [], 'quotes': []}

        # Vacuum analyze.
        logger.info('Vacuuming and analyzing database')
        click.echo('Vacuuming and analyzing... ', nl=False)
        execute_raw(Session.kw['bind'], 'VACUUM ANALYZE')
        click.secho('OK', fg='green', bold=True)

        # And check.
        logger.info('Checking consistency of the file against the database')
        click.echo('Checking consistency...')
        self._check()

        # Don't do this twice.
        self.parsed = True
        click.secho('All done.', fg='green', bold=True)

    def _parse(self):
        """Do the actual MemeTracker file parsing.

        Initialises the parsing tracking variables, then delegates each new
        cluster block to :meth:`_parse_cluster_block`. Parsed clusters and
        quotes are stored as :class:`~.db.Cluster`\ s and
        :class:`~.db.Quote`\ s in `self._objects` (to be saved later in
        :meth:`parse`). Frequency and url counts for clusters and quotes are
        saved in `self._checks` for later checking in :meth:`parse`.

        """

        # Accumulators for parsed objects and for the consistency checks
        # performed later by _check().
        self._objects = {'clusters': [], 'quotes': []}
        self._checks = {}

        # Skip the non-data lines at the top of the file, then prime the
        # parse with the first real line and reset progress counters.
        self._skip_header()
        self._cluster_line = self._file.readline()
        self._clusters_read = 0
        self._lines_read = 1
        self._bar.update(self._lines_read)

        # Each iteration consumes one full cluster block; _cluster_line is
        # set to None by _parse_cluster_block() when there is nothing left.
        while self._cluster_line is not None:
            logger.debug("Parsing new cluster ('%s')", self._cluster_line[:-1])
            self._parse_cluster_block()

    def _check(self):
        """Check the consistency of the database with `self._checks`.

        The original MemeTracker dataset specifies the number of quotes and
        frequency for each cluster, and the number of urls and frequency for
        each quote. This information is saved in `self._checks` during parsing.
        This method iterates through the whole database of saved
        :class:`~.db.Cluster`\ s and :class:`~.db.Quote`\ s to check that their
        counts correspond to what the MemeTracker dataset says (as stored in
        `self._checks`).

        Raises
        ------
        ValueError
            If any count in the database differs from its specification in
            `self._checks`.

        """

        # Named `cluster_id` so as not to shadow the builtin `id`.
        for cluster_id, check in ProgressBar()(self._checks.items()):
            logger.debug('Checking cluster #%s consistency', cluster_id)

            with session_scope() as session:
                # Check the cluster itself.
                cluster = session.query(Cluster).get(cluster_id)
                err_end = (' #{} does not match value'
                           ' in file').format(cluster.sid)
                if check['cluster']['size'] != cluster.size:
                    raise ValueError("Cluster size" + err_end)
                if check['cluster']['frequency'] != cluster.frequency:
                    raise ValueError("Cluster frequency" + err_end)

                # Check each quote.
                for quote in cluster.quotes:
                    quote_check = check['quotes'][quote.id]
                    err_end = (' #{} does not match value'
                               ' in file').format(quote.sid)
                    if quote_check['size'] != quote.size:
                        raise ValueError("Quote size" + err_end)
                    if quote_check['frequency'] != quote.frequency:
                        raise ValueError("Quote frequency" + err_end)

        self._checks = {}

    def _parse_cluster_block(self):
        """Parse a block of lines representing a cluster in the source
        MemeTracker file.

        The :class:`~.db.Cluster` itself is first created from
        `self._cluster_line` with :meth:`_handle_cluster`, then each following
        line is delegated to :meth:`_handle_quote` or :meth:`_handle_url` until
        exhaustion of this cluster block. During the parsing of this cluster,
        `self._cluster` holds the current cluster being filled and
        `self._quote` the current quote (both are cleaned up when the method
        finishes). At the end of this block, the method increments
        `self._clusters_read` and sets `self._cluster_line` to the line
        defining the next cluster, or `None` if the end of file or `self.limit`
        was reached.

        Raises
        ------
        ValueError
            If `self._cluster_line` is not a line defining a new cluster.

        """

        # Keep a reference to the line being checked: self._cluster_line is
        # reset just below, so the error message must use this copy (the old
        # code formatted self._cluster_line after nulling it, always
        # printing None).
        cluster_line = self._cluster_line
        tipe, fields = self._parse_line(cluster_line)
        # If self._cluster_line stays None, _parse() stops.
        # So it's filled further down when we get to the next cluster
        # definition line (unless self.limit says we should read
        # only a subset of all clusters).
        self._cluster_line = None
        if tipe != 'cluster':
            raise ValueError("Our supposed cluster_line ('{}', line {}) "
                             "is not a cluster line!"
                             .format(cluster_line,
                                     self._lines_read + self.header_size))

        # Create the cluster.
        self._handle_cluster(fields)

        # Keep reading until the next cluster, or exhaustion. `line` is
        # pre-set to None so the check below is safe even if the file is
        # already exhausted and the loop never runs (in which case `tipe`
        # would still be the stale 'cluster' from above).
        line = None
        for line in self._file:
            self._lines_read += 1
            self._bar.update(self._lines_read)

            tipe, fields = self._parse_line(line)
            if tipe == 'cluster':
                break
            elif tipe == 'quote':
                self._handle_quote(fields)
            elif tipe == 'url':
                self._handle_url(fields)

        # If we just saw a new cluster, feed that new cluster_line
        # for the next cluster, unless asked to stop.
        self._clusters_read += 1
        if (line is not None and tipe == 'cluster' and
                (self.limit is None or self._clusters_read < self.limit)):
            self._cluster_line = line

        # Clean up.
        self._cluster = None
        self._quote = None

    @classmethod
    def _parse_line(cls, line):
        """Parse `line` to determine if it's a cluster-, quote- or url-line, or
        anything else.

        Parameters
        ----------
        line : str
            A line from the MemeTracker dataset to parse.

        Returns
        -------
        tipe : str in {'cluster', 'quote', 'url'} or None
            The type of object that `line` defines; `None` if unknown or empty
            line.
        fields : list of str
            List of the tab-separated fields in `line`.

        """

        # A cluster line is the only kind that starts with a non-blank
        # character; quote lines start with one tab, url lines with two.
        line0 = re.split(r'[\xa0\s+\t\r\n]+', line)
        if line0[0] != '':
            tipe = 'cluster'
        elif line[0] == '\t' and line[1] != '\t':
            tipe = 'quote'
        elif line[0] == '\t' and line[1] == '\t' and line[2] != '\t':
            tipe = 'url'
        else:
            tipe = None
        return tipe, re.split(r'[\t\r\n]', line)

    def _handle_cluster(self, fields):
        """Handle a list of cluster fields to create a new :class:`~.db.Cluster`.

        The newly created :class:`~.db.Cluster` is appended to
        `self._objects['clusters']`, and corresponding fields are created in
        `self._checks`.

        Parameters
        ----------
        fields : list of str
            List of fields defining the new cluster, as returned by
            :meth:`_parse_line`.

        """

        # Named `cluster_id` so as not to shadow the builtin `id`.
        cluster_id = int(fields[3])
        self._cluster = Cluster(id=cluster_id, sid=cluster_id, filtered=False,
                                source='memetracker')
        self._objects['clusters'].append(self._cluster)

        # Save checks for later on.
        cluster_size = int(fields[0])
        cluster_frequency = int(fields[1])
        self._checks[self._cluster.id] = {
            'quotes': {},
            'cluster': {
                'size': cluster_size,
                'frequency': cluster_frequency
            }
        }

    def _handle_quote(self, fields):
        """Handle a list of quote fields to create a new :class:`~.db.Quote`.

        The newly created :class:`~.db.Quote` is appended to
        `self._objects['quotes']`, and corresponding fields are created in
        `self._checks`.

        Parameters
        ----------
        fields : list of str
            List of fields defining the new quote, as returned by
            :meth:`_parse_line`.

        """

        # Named `quote_id` so as not to shadow the builtin `id`.
        quote_id = int(fields[4])
        self._quote = Quote(cluster_id=self._cluster.id, id=quote_id,
                            sid=quote_id, filtered=False, string=fields[3])
        self._objects['quotes'].append(self._quote)

        # Save checks for later on.
        quote_size = int(fields[2])
        quote_frequency = int(fields[1])
        self._checks[self._cluster.id]['quotes'][self._quote.id] = {
            'size': quote_size,
            'frequency': quote_frequency
        }

    def _handle_url(self, fields):
        """Handle a list of url fields to create a new :class:`~.db.Url`.

        The newly created :class:`~.db.Url` is stored on `self._quote` which
        holds the currently parsed quote.

        Parameters
        ----------
        fields : list of str
            List of fields defining the new url, as returned by
            :meth:`_parse_line`.

        """

        # Sanity check: the parsed timestamp should be naive (the file
        # carries no timezone information).
        parsed_timestamp = datetime.strptime(fields[2], '%Y-%m-%d %H:%M:%S')
        assert parsed_timestamp.tzinfo is None

        self._quote.add_url(
            Url(timestamp=parsed_timestamp, frequency=int(fields[3]),
                url_type=fields[4], url=fields[5]))