# Common imports for the migrations below; `logger` stands in for each
# migration module's own logger.
import logging
import time
from datetime import timedelta

import rhaptos.cnxmlutils

logger = logging.getLogger(__name__)


def up(cursor):
    """Generate collection.xml for collections that need it"""
    to_build = _need_collxml(cursor)
    num_todo = len(to_build)
    batch_size = 100
    logger.info('collection.xml to generate: {}'.format(num_todo))
    logger.info('Batch size: {}'.format(batch_size))

    start = time.time()
    guesstimate = 0.01 * num_todo
    guess_complete = guesstimate + start
    logger.info('Completion guess: '
                '"{}" ({})'.format(time.ctime(guess_complete),
                                   timedelta(0, guesstimate)))

    num_complete = 0
    for batch in _batcher(to_build, batch_size):
        coll_idents = tuple([i[0] for i in batch])
        logger.debug('coll_idents {}'.format(coll_idents))
        for coll_ident in coll_idents:
            _build_collxml(coll_ident, cursor)
        cursor.connection.commit()

        num_complete += len(batch)
        percent_comp = num_complete * 100.0 / num_todo
        elapsed = time.time() - start
        remaining_est = elapsed * (num_todo - num_complete) / num_complete
        est_complete = start + elapsed + remaining_est
        logger.info('{:.1f}% complete '
                    'est: "{}" ({})'.format(percent_comp,
                                            time.ctime(est_complete),
                                            timedelta(0, remaining_est)))

    logger.info('Total runtime: {}'.format(timedelta(0, time.time() - start)))
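# The migrations in this section all assume a `_batcher` helper that yields
# fixed-size chunks of a sequence. It is not shown here; the following is a
# minimal sketch inferred from the call sites (the name and signature come
# from the callers, the body is an assumption):
def _batcher(seq, batch_size):
    """Yield consecutive slices of ``seq``, each at most ``batch_size`` long."""
    for start in range(0, len(seq), batch_size):
        yield seq[start:start + batch_size]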
def generate_update_values(nodeid, title):
    """Return a pair of trees.nodeid and trees.slug to be used to
    update the slug value in the trees table.

    """
    logger.info("processing... {} - {}".format(nodeid, title))
    try:
        slug = generate_slug(*title)
    except Exception:
        logger.exception("failed to create slug for '{}'".format(title))
        raise
    logger.info("... using {}".format(slug))
    # must return an array of a single type for postgresql
    return [str(nodeid), slug]
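# Example use (illustrative; the titles and resulting slug are assumptions):
# `title` arrives as a sequence of titles along the tree path, so
# `generate_slug(*title)` receives them as separate arguments:
#
#   generate_update_values(42, ('College Physics', 'Preface'))
#   # -> ['42', <slug computed by generate_slug>]
#
# The str() cast matters because psycopg2 adapts the batched list of these
# pairs to a single PostgreSQL array, and a PostgreSQL array cannot mix
# integer and text elements.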
def should_run(cursor, limit='LIMIT 1'):
    """Find index.cnxml.html files that were not generated by the
    current rhaptos.cnxmlutils version."""
    version = rhaptos.cnxmlutils.__version__
    version_text = '%data-cnxml-to-html-ver="{}"%'.format(version)
    logger.info('Looking for {}'.format(version_text))
    cursor.execute("""\
WITH index_cnxml_html AS (
    SELECT files.fileid, module_ident, file
    FROM module_files
    NATURAL JOIN files
    WHERE filename = 'index.cnxml.html'
)
SELECT fileid, max(module_ident)
FROM index_cnxml_html
WHERE convert_from(file, 'utf-8') NOT LIKE %s
GROUP BY fileid {}""".format(limit), (version_text,))
    return cursor.fetchall()
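# should_run doubles as a cheap probe and a work-list query: with the default
# `LIMIT 1` it returns at most one row, which is enough to decide whether the
# migration applies at all, while `limit=''` returns the full list of stale
# files. A minimal sketch of the probe usage (the helper name is an
# assumption, not part of the original migration):
def needs_migration(cursor):
    """Return True if any index.cnxml.html predates the current version."""
    return bool(should_run(cursor))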
def up(cursor):
    """Add canonical for all modules that do not have it"""
    to_update = should_run(cursor, limit='')
    num_todo = len(to_update)
    batch_size = 1000
    logger.info('Pages to update: {}'.format(num_todo))
    logger.info('Batch size: {}'.format(batch_size))

    start = time.time()
    guesstimate = 0.01 * num_todo
    guess_complete = guesstimate + start
    logger.info('Completion guess: '
                '"{}" ({})'.format(time.ctime(guess_complete),
                                   timedelta(0, guesstimate)))

    num_complete = 0
    for batch in _batcher(to_update, batch_size):
        module_idents = tuple([i[0] for i in batch])
        cursor.execute("""UPDATE modules
            SET canonical = default_canonical_book(uuid)
            WHERE module_ident IN %s""", (module_idents,))
        cursor.connection.commit()

        num_complete += len(batch)
        percent_comp = num_complete * 100.0 / num_todo
        elapsed = time.time() - start
        remaining_est = elapsed * (num_todo - num_complete) / num_complete
        est_complete = start + elapsed + remaining_est
        logger.info('{:.1f}% complete '
                    'est: "{}" ({})'.format(percent_comp,
                                            time.ctime(est_complete),
                                            timedelta(0, remaining_est)))

    logger.info('Total runtime: {}'.format(timedelta(0, time.time() - start)))
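# Note on the `IN %s` parameterization above: psycopg2 adapts a Python
# *tuple* to a parenthesized SQL list, e.g.
#
#   cursor.execute("... WHERE module_ident IN %s", ((101, 102, 103),))
#   # -> ... WHERE module_ident IN (101, 102, 103)
#
# whereas a Python *list* would be adapted to an ARRAY literal, which does
# not work with IN. That is why each batch is converted with tuple() before
# execute().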
def up(cursor):
    """Create fulltext indexes for books"""
    to_index = should_run(cursor)
    num_todo = len(to_index)
    batch_size = 100
    logger.info('Books to index: {}'.format(num_todo))
    logger.info('Batch size: {}'.format(batch_size))

    start = time.time()
    guesstimate = 0.01 * num_todo
    guess_complete = guesstimate + start
    logger.info('Completion guess: '
                '"{}" ({})'.format(time.ctime(guess_complete),
                                   timedelta(0, guesstimate)))

    module_idents = tuple([i[0] for i in to_index])
    num_complete = 0
    for batch in _batcher(module_idents, batch_size):
        for module_ident in batch:
            cursor.execute("SELECT insert_book_fti(%(module_ident)s)",
                           {'module_ident': module_ident})
        cursor.connection.commit()

        num_complete += len(batch)
        percent_comp = num_complete * 100.0 / num_todo
        elapsed = time.time() - start
        remaining_est = elapsed * (num_todo - num_complete) / num_complete
        est_complete = start + elapsed + remaining_est
        logger.info('{:.1f}% complete '
                    'est: "{}" ({})'.format(percent_comp,
                                            time.ctime(est_complete),
                                            timedelta(0, remaining_est)))

    logger.info('Total runtime: {}'.format(timedelta(0, time.time() - start)))
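# The progress/ETA bookkeeping above repeats verbatim in every `up()` in this
# section. A possible shared helper (a sketch, not part of the original
# migrations) that preserves the exact log format:
def _log_progress(start, num_complete, num_todo):
    """Log percent complete plus an estimated completion time."""
    percent_comp = num_complete * 100.0 / num_todo
    elapsed = time.time() - start
    remaining_est = elapsed * (num_todo - num_complete) / num_complete
    est_complete = start + elapsed + remaining_est
    logger.info('{:.1f}% complete '
                'est: "{}" ({})'.format(percent_comp,
                                        time.ctime(est_complete),
                                        timedelta(0, remaining_est)))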
def up(cursor):
    """Transform all index.cnxml to index.cnxml.html"""
    # Get all the index.cnxml.html files that were not generated by the
    # current rhaptos.cnxmlutils version
    to_transform = should_run(cursor, limit='')
    num_todo = len(to_transform)
    batch_size = 100
    logger.info('Pages to transform: {}'.format(num_todo))
    logger.info('Batch size: {}'.format(batch_size))

    start = time.time()
    guesstimate = 0.01 * num_todo
    guess_complete = guesstimate + start
    logger.info('Completion guess: '
                '"{}" ({})'.format(time.ctime(guess_complete),
                                   timedelta(0, guesstimate)))

    module_idents = tuple([i[1] for i in to_transform])

    # Create datamigrations.index_cnxml_html if it does not already exist
    cursor.execute("CREATE SCHEMA IF NOT EXISTS datamigrations")
    cursor.execute("""\
CREATE TABLE IF NOT EXISTS datamigrations.index_cnxml_html (
    LIKE module_files
)""")

    # Store module_files rows for the modules we are going to update
    cursor.execute("""\
INSERT INTO datamigrations.index_cnxml_html
SELECT * FROM module_files
WHERE module_ident IN %s
    AND filename = 'index.cnxml.html'
    AND NOT EXISTS (
        SELECT 1 FROM datamigrations.index_cnxml_html b
        WHERE b.module_ident = module_files.module_ident);

UPDATE datamigrations.index_cnxml_html b
SET fileid = module_files.fileid
FROM module_files
WHERE module_files.module_ident = b.module_ident
    AND module_files.filename = b.filename
    AND module_files.fileid != b.fileid
    AND module_files.module_ident IN %s""", (module_idents, module_idents))

    num_complete = 0
    for batch in _batcher(to_transform, batch_size):
        module_idents = tuple([i[1] for i in batch])
        logger.debug('Transform module_idents {}'.format(module_idents))
        for fileid, module_ident in batch:
            cursor.execute("""\
WITH index_cnxml AS (
    SELECT files.fileid, file
    FROM module_files
    NATURAL JOIN files
    WHERE module_ident = %(module_ident)s
        AND filename = 'index.cnxml'
    LIMIT 1
), transformed AS (
    SELECT html_content(%(module_ident)s) AS content
)
INSERT INTO files (file, media_type)
SELECT convert_to(transformed.content, 'utf-8'), 'text/xml'
FROM index_cnxml, transformed
WHERE char_length(substring(encode(file, 'escape')
                            FROM 'cnxml-version=.0.7.')) > 0
    AND NOT EXISTS (
        SELECT 1 FROM files WHERE sha1 = sha1(transformed.content))
RETURNING fileid""", {'module_ident': module_ident})
            new_fileid = cursor.fetchall()
            if new_fileid:
                cursor.execute("""\
UPDATE module_files SET fileid = %s
WHERE fileid = %s""", (new_fileid[0][0], fileid))
        cursor.connection.commit()

        num_complete += len(batch)
        percent_comp = num_complete * 100.0 / num_todo
        elapsed = time.time() - start
        remaining_est = elapsed * (num_todo - num_complete) / num_complete
        est_complete = start + elapsed + remaining_est
        logger.info('{:.1f}% complete '
                    'est: "{}" ({})'.format(percent_comp,
                                            time.ctime(est_complete),
                                            timedelta(0, remaining_est)))

    logger.info('Total runtime: {}'.format(timedelta(0, time.time() - start)))
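# The datamigrations.index_cnxml_html table populated above is a backup of
# the original module_files rows, which makes this transform reversible. A
# hedged sketch of the corresponding down() (an assumption inferred from the
# backup table; the actual rollback is not shown in this section):
def down(cursor):
    """Restore the original index.cnxml.html fileids from the backup."""
    cursor.execute("""\
UPDATE module_files mf
SET fileid = b.fileid
FROM datamigrations.index_cnxml_html b
WHERE mf.module_ident = b.module_ident
    AND mf.filename = b.filename
    AND mf.fileid != b.fileid""")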
def up(cursor):
    # Create a sql function for reducing the dimension of an array
    cursor.execute(CREATE_REDUCE_DIM)

    # Roll over all collated tree records.
    # Cannot iterate over the results, because we need the cursor for
    # updating the records we are rolling over.
    logger.info("starting query of entire trees table... *tick tock*")
    cursor.execute(TREE_QUERY)
    records = cursor.fetchall()

    # Provide general status information to the user
    num_todo = len(records)
    logger.info('Items to update: {}'.format(num_todo))
    logger.info('Batch size: {}'.format(BATCH_SIZE))

    # Time the entire process
    start = time.time()
    guesstimate = 0.01 * num_todo
    guess_complete = guesstimate + start
    logger.info('Completion guess: "{}" ({})'.format(
        time.ctime(guess_complete),
        timedelta(0, guesstimate),
    ))

    # Iteratively update the trees records in batches
    num_complete = 0
    for batch in batcher(records, BATCH_SIZE):
        updates = [
            generate_update_values(nodeid, title)
            for nodeid, title in batch
        ]
        cursor.execute(UPDATE_STMT, (updates,))
        cursor.connection.commit()

        # Print out timing information after each batch
        num_complete += len(batch)
        percent_comp = num_complete * 100.0 / num_todo
        elapsed = time.time() - start
        remaining_est = elapsed * (num_todo - num_complete) / num_complete
        est_complete = start + elapsed + remaining_est
        logger.info('{:.1f}% complete '
                    'est: "{}" ({})'.format(percent_comp,
                                            time.ctime(est_complete),
                                            timedelta(0, remaining_est)))

    total_time = timedelta(0, time.time() - start)
    logger.info('Total runtime: {}'.format(total_time))

    cursor.execute(DROP_REDUCE_DIM)
    purge_contents_cache()
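# CREATE_REDUCE_DIM and UPDATE_STMT are referenced above but not shown in
# this section. One plausible shape, inferred from the call sites (psycopg2
# adapts the list of [nodeid, slug] pairs to a two-dimensional text array;
# reduce_dim splits that array back into one-dimensional rows). These exact
# statements are assumptions, not the migration's own definitions:
CREATE_REDUCE_DIM = """\
CREATE OR REPLACE FUNCTION reduce_dim(anyarray)
RETURNS SETOF anyarray AS $$
DECLARE
    s $1%TYPE;
BEGIN
    FOREACH s SLICE 1 IN ARRAY $1 LOOP
        RETURN NEXT s;
    END LOOP;
    RETURN;
END;
$$ LANGUAGE plpgsql IMMUTABLE"""

DROP_REDUCE_DIM = "DROP FUNCTION IF EXISTS reduce_dim(anyarray)"

UPDATE_STMT = """\
UPDATE trees
SET slug = v.vals[2]
FROM (SELECT reduce_dim(%s) AS vals) AS v
WHERE trees.nodeid = v.vals[1]::integer"""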