# Imports shared by the example migrations below.
import logging
import time
from datetime import timedelta

import rhaptos.cnxmlutils

logger = logging.getLogger(__name__)


def up(cursor):
    """Generate collection.xml files for collections that are missing them."""
    to_build = _need_collxml(cursor)
    num_todo = len(to_build)

    batch_size = 100
    logger.info('collection.xml files to generate: {}'.format(num_todo))
    logger.info('Batch size: {}'.format(batch_size))

    start = time.time()
    guesstimate = 0.01 * num_todo
    guess_complete = guesstimate + start
    logger.info('Completion guess: '
                '"{}" ({})'.format(time.ctime(guess_complete),
                                   timedelta(0, guesstimate)))

    num_complete = 0
    for batch in _batcher(to_build, batch_size):
        coll_idents = tuple(i[0] for i in batch)
        logger.debug('coll_idents {}'.format(coll_idents))

        for coll_ident in coll_idents:
            # Build and store the collection.xml file for this collection.
            _build_collxml(coll_ident, cursor)

        cursor.connection.commit()
        num_complete += len(batch)
        percent_comp = num_complete * 100.0 / num_todo
        elapsed = time.time() - start
        remaining_est = elapsed * (num_todo - num_complete) / num_complete
        est_complete = start + elapsed + remaining_est
        logger.info('{:.1f}% complete '
                    'est: "{}" ({})'.format(percent_comp,
                                            time.ctime(est_complete),
                                            timedelta(0, remaining_est)))

    # Compute from start so this also works when there were zero batches
    # (otherwise `elapsed` would be unbound).
    logger.info('Total runtime: {}'.format(timedelta(0, time.time() - start)))
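
# All of the migrations in this listing rely on a batching helper that is
# not shown here (``_batcher``; the final example calls it ``batcher``).
# A minimal sketch, assuming it simply slices the sequence into fixed-size
# chunks; the real helper may differ:
def _batcher(sequence, batch_size):
    """Yield successive batch_size-sized chunks from sequence."""
    for index in range(0, len(sequence), batch_size):
        yield sequence[index:index + batch_size]
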
def generate_update_values(nodeid, title):
    """Returns a sequence of trees.nodeid and trees.slug
    to be used to update the trees slug table value.

    """
    logger.info("processing... {} - {}".format(nodeid, title))
    try:
        # title is expected to be a sequence (presumably the chain of titles
        # from the book down to this node), hence the argument unpacking.
        slug = generate_slug(*title)
    except Exception:
        logger.exception("failed to create slug for '{}'".format(title))
        raise
    logger.info("... using {}".format(slug))
    # Must return an array of a single type for PostgreSQL.
    return [str(nodeid), slug]
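
# Hypothetical usage (generate_slug is not shown in this listing; the
# output below assumes it joins the title chain into a URL-safe slug):
#
#   generate_update_values(42, ['College Physics', 'Introduction'])
#   # -> ['42', 'introduction']  (illustrative only)
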
def should_run(cursor, limit='LIMIT 1'):
    """Return (fileid, module_ident) pairs for index.cnxml.html files that
    were not generated by the current rhaptos.cnxmlutils version.
    """
    version = rhaptos.cnxmlutils.__version__
    version_text = '%data-cnxml-to-html-ver="{}"%'.format(version)
    logger.info('Looking for {}'.format(version_text))
    cursor.execute("""\
WITH index_cnxml_html AS (
    SELECT files.fileid, module_ident, file
        FROM module_files NATURAL JOIN files
        WHERE filename = 'index.cnxml.html'
) SELECT fileid, max(module_ident)
    FROM index_cnxml_html
    WHERE convert_from(file, 'utf-8') NOT LIKE %s
    GROUP BY fileid {}""".format(limit),
                   (version_text,))
    return cursor.fetchall()
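
# Example usage: the default limit makes should_run a cheap existence
# check, while limit='' returns the full work list, as the migrations
# below do.
#
#   if should_run(cursor):                    # LIMIT 1: anything outdated?
#       to_do = should_run(cursor, limit='')  # the full list
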
def up(cursor):
    """Set the canonical book for all modules that do not have one."""
    to_update = should_run(cursor, limit='')
    num_todo = len(to_update)

    batch_size = 1000
    logger.info('Pages to update: {}'.format(num_todo))
    logger.info('Batch size: {}'.format(batch_size))

    start = time.time()
    guesstimate = 0.01 * num_todo
    guess_complete = guesstimate + start
    logger.info('Completion guess: '
                '"{}" ({})'.format(time.ctime(guess_complete),
                                   timedelta(0, guesstimate)))

    num_complete = 0
    for batch in _batcher(to_update, batch_size):
        module_idents = tuple(i[0] for i in batch)

        cursor.execute("""UPDATE modules
            SET canonical = default_canonical_book(uuid)
            WHERE module_ident IN %s""", (module_idents,))

        cursor.connection.commit()
        num_complete += len(batch)
        percent_comp = num_complete * 100.0 / num_todo
        elapsed = time.time() - start
        remaining_est = elapsed * (num_todo - num_complete) / num_complete
        est_complete = start + elapsed + remaining_est
        logger.info('{:.1f}% complete '
                    'est: "{}" ({})'.format(percent_comp,
                                            time.ctime(est_complete),
                                            timedelta(0, remaining_est)))

    logger.info('Total runtime: {}'.format(timedelta(0, time.time() - start)))
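
# Note: passing a Python tuple for the "IN %s" placeholder relies on
# psycopg2's tuple adaptation, which expands it into a parenthesized value
# list. An empty tuple would produce invalid SQL, but the batcher never
# yields an empty batch.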
def up(cursor):
    """Create fulltext indexes for books"""
    to_index = should_run(cursor)
    num_todo = len(to_index)

    batch_size = 100
    logger.info('Books to index: {}'.format(num_todo))
    logger.info('Batch size: {}'.format(batch_size))

    start = time.time()
    guesstimate = 0.01 * num_todo
    guess_complete = guesstimate + start
    logger.info('Completion guess: '
                '"{}" ({})'.format(time.ctime(guess_complete),
                                   timedelta(0, guesstimate)))

    module_idents = tuple(i[0] for i in to_index)
    num_complete = 0
    for batch in _batcher(module_idents, batch_size):

        for module_ident in batch:
            # insert_book_fti is a database-side function (not shown here)
            # that builds the fulltext-index entry for the given book.
            cursor.execute("SELECT insert_book_fti(%(module_ident)s)",
                           {'module_ident': module_ident})

        cursor.connection.commit()
        num_complete += len(batch)
        percent_comp = num_complete * 100.0 / num_todo
        elapsed = time.time() - start
        remaining_est = elapsed * (num_todo - num_complete) / num_complete
        est_complete = start + elapsed + remaining_est
        logger.info('{:.1f}% complete '
                    'est: "{}" ({})'.format(percent_comp,
                                            time.ctime(est_complete),
                                            timedelta(0, remaining_est)))

    logger.info('Total runtime: {}'.format(timedelta(0, time.time() - start)))
def up(cursor):
    """Transform all index.cnxml to index.cnxml.html"""
    # Get all the index.cnxml.html files that were not generated with the
    # current rhaptos.cnxmlutils version.
    to_transform = should_run(cursor, limit='')
    num_todo = len(to_transform)

    batch_size = 100
    logger.info('Pages to transform: {}'.format(num_todo))
    logger.info('Batch size: {}'.format(batch_size))

    start = time.time()
    guesstimate = 0.01 * num_todo
    guess_complete = guesstimate + start
    logger.info('Completion guess: '
                '"{}" ({})'.format(time.ctime(guess_complete),
                                   timedelta(0, guesstimate)))
    module_idents = tuple(i[1] for i in to_transform)

    # Check if datamigrations.index_cnxml_html exists, else create it
    cursor.execute("CREATE SCHEMA IF NOT EXISTS datamigrations")
    cursor.execute("""\
CREATE TABLE IF NOT EXISTS datamigrations.index_cnxml_html
    ( LIKE module_files )""")
    # Store module_files for modules we are going to update
    cursor.execute("""\
INSERT INTO datamigrations.index_cnxml_html
    SELECT * FROM module_files
        WHERE module_ident IN %s
          AND filename = 'index.cnxml.html'
          AND NOT EXISTS (
            SELECT 1 FROM datamigrations.index_cnxml_html b
                WHERE b.module_ident = module_files.module_ident);
UPDATE datamigrations.index_cnxml_html b
    SET fileid = module_files.fileid
    FROM module_files
    WHERE module_files.module_ident = b.module_ident
      AND module_files.filename = b.filename
      AND module_files.fileid != b.fileid
      AND module_files.module_ident IN %s""",
                   (module_idents, module_idents))

    num_complete = 0
    for batch in _batcher(to_transform, batch_size):
        module_idents = tuple(i[1] for i in batch)
        logger.debug('Transform module_idents {}'.format(module_idents))

        for fileid, module_ident in batch:
            cursor.execute("""\
WITH index_cnxml AS (
    SELECT files.fileid, file
        FROM module_files NATURAL JOIN files
        WHERE module_ident = %(module_ident)s
          AND filename = 'index.cnxml'
        LIMIT 1
), transformed AS (
    SELECT html_content(%(module_ident)s) AS content
) INSERT INTO files
    (file, media_type)
    SELECT convert_to(transformed.content, 'utf-8'), 'text/xml'
        FROM index_cnxml, transformed
        WHERE char_length(substring(encode(file, 'escape')
                                    FROM 'cnxml-version=.0.7.')) > 0
          AND NOT EXISTS (
            SELECT 1 FROM files
                WHERE sha1 = sha1(transformed.content))
RETURNING fileid""", {'module_ident': module_ident})
            new_fileid = cursor.fetchall()
            if new_fileid:
                cursor.execute("""\
UPDATE module_files SET fileid = %s
WHERE fileid = %s""", (new_fileid[0][0], fileid))

        cursor.connection.commit()
        num_complete += len(batch)
        percent_comp = num_complete * 100.0 / num_todo
        elapsed = time.time() - start
        remaining_est = elapsed * (num_todo - num_complete) / num_complete
        est_complete = start + elapsed + remaining_est
        logger.info('{:.1f}% complete '
                    'est: "{}" ({})'.format(percent_comp,
                                            time.ctime(est_complete),
                                            timedelta(0, remaining_est)))

    logger.info('Total runtime: {}'.format(timedelta(0, time.time() - start)))
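
# A hypothetical down() counterpart (a sketch, not part of the original
# migration): restore the module_files rows saved in the
# datamigrations.index_cnxml_html backup table above.
def down(cursor):
    cursor.execute("""\
UPDATE module_files mf
    SET fileid = b.fileid
    FROM datamigrations.index_cnxml_html b
    WHERE b.module_ident = mf.module_ident
      AND b.filename = mf.filename
      AND b.fileid != mf.fileid""")
    cursor.connection.commit()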
def up(cursor):
    # Create the SQL function for reducing the dimension of an array.
    cursor.execute(CREATE_REDUCE_DIM)

    # Roll over all collated tree records.
    # Cannot iterate over the results, because we need the cursor for
    # updating the records we are rolling over.
    logger.info("starting query of entire trees table... *tick tock*")
    cursor.execute(TREE_QUERY)
    records = cursor.fetchall()

    # Provide status information to the user.
    num_todo = len(records)
    logger.info('Items to update: {}'.format(num_todo))
    logger.info('Batch size: {}'.format(BATCH_SIZE))

    # Time the entire process
    start = time.time()
    guesstimate = 0.01 * num_todo
    guess_complete = guesstimate + start
    logger.info('Completion guess: "{}" ({})'.format(
        time.ctime(guess_complete),
        timedelta(0, guesstimate),
    ))

    # Iteratively update the trees records in batches
    num_complete = 0
    for batch in batcher(records, BATCH_SIZE):
        updates = [
            generate_update_values(nodeid, title) for nodeid, title in batch
        ]
        cursor.execute(UPDATE_STMT, (updates,))
        cursor.connection.commit()

        # print out time information after each batch
        num_complete += len(batch)
        percent_comp = num_complete * 100.0 / num_todo
        elapsed = time.time() - start
        remaining_est = elapsed * (num_todo - num_complete) / num_complete
        est_complete = start + elapsed + remaining_est
        logger.info('{:.1f}% complete '
                    'est: "{}" ({})'.format(percent_comp,
                                            time.ctime(est_complete),
                                            timedelta(0, remaining_est)))

    total_time = timedelta(0, time.time() - start)
    logger.info('Total runtime: {}'.format(total_time))

    cursor.execute(DROP_REDUCE_DIM)
    purge_contents_cache()
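
# A minimal driver sketch showing how one of these migrations might be run.
# The DSN handling and the choice of psycopg2 are assumptions; the
# migration framework that normally supplies the cursor is not shown in
# this listing.
import psycopg2


def run_migration(dsn, migration):
    """Open a connection, hand its cursor to a migration, and clean up."""
    conn = psycopg2.connect(dsn)
    try:
        with conn.cursor() as cursor:
            migration(cursor)  # e.g. one of the up() functions above
        conn.commit()
    finally:
        conn.close()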