def needsRecovery(self):
    """Check for recoverable holding-table data left by a previous run.

    :return: True if pouring can resume from where an earlier run
        stopped, False otherwise.
    """
    cur = cursor()

    # pour() walks self.tables in order, so if any holding table is
    # still waiting to be poured, the one belonging to the final table
    # must at least exist.
    final_holding = self.getRawHoldingTableName(self.tables[-1])
    if not postgresql.have_table(cur, final_holding):
        return False

    # If the first table's holding table also survives and still
    # carries its new_id column, pouring had not begun at all; assume
    # the data was never ready for pouring.
    initial_holding = self.getRawHoldingTableName(self.tables[0])
    if postgresql.table_has_column(cur, initial_holding, 'new_id'):
        self.logger.info(
            "Previous run aborted too early for recovery; redo all")
        return False

    self.logger.info("Recoverable data found")
    return True
def pour(self, transaction_manager):
    """Pour data from holding tables back into source tables.

    Rows in the holding table that have their new_id set to null are
    skipped.

    The transaction manager is committed and re-opened after every
    batch run.  Batch sizes are dynamically adjusted to meet the stated
    time goal.

    :param transaction_manager: transaction manager that is committed
        and re-opened between batches (via self._commit).
    :raises AssertionError: if extraction did not complete and there is
        no recoverable data from a previous run.
    """
    if self.last_extracted_table is None:
        # Nothing was extracted in this run; pouring is only legal when
        # we are recovering data a previous, failed run left behind.
        if not self.needsRecovery():
            raise AssertionError("Can't pour: no tables extracted")
    elif self.last_extracted_table != len(self.tables) - 1:
        raise AssertionError(
            "Not safe to pour: last table '%s' was not extracted"
            % self.tables[-1])

    cur = self._commit(transaction_manager)

    # Don't let postgres revert to slow sequential scans while we pour.
    # That might otherwise happen to the holding table as its vital "id"
    # index degrades with the removal of rows.
    postgresql.allow_sequential_scans(cur, False)

    # Main loop: for each of the source tables being copied, see if
    # there's a matching holding table.  If so, prepare it, pour it
    # back into the source table, and drop.
    for table in self.tables:
        holding_table_unquoted = self.getRawHoldingTableName(table)

        if not postgresql.have_table(cur, holding_table_unquoted):
            # We know we're in a suitable state for pouring.  If this
            # table does not exist, it must be because it's been poured
            # out completely and dropped in an earlier instance of this
            # loop, before the failure we're apparently recovering from.
            continue

        holding_table = self.getHoldingTableName(table)
        # Pass format arguments to the logger instead of pre-formatting
        # with %: the message is only built if this level is emitted.
        self.logger.info(
            "Pouring %s back into %s...", holding_table, table)

        tablestarttime = time.time()

        has_new_id = postgresql.table_has_column(
            cur, holding_table_unquoted, 'new_id')
        self._pourTable(
            holding_table, table, has_new_id, transaction_manager)

        # Drop holding table.  It may still contain rows with id set to
        # null.  Those must not be poured.
        postgresql.drop_tables(cursor(), holding_table)

        self.logger.debug(
            "Pouring %s took %.3f seconds.",
            holding_table, time.time() - tablestarttime)

        cur = self._commit(transaction_manager)

    # In future, let the database perform sequential scans again if it
    # decides that's best.
    postgresql.allow_sequential_scans(cur, True)
def pour(self, transaction_manager):
    """Pour data from holding tables back into source tables.

    Rows in the holding table that have their new_id set to null are
    skipped.

    The transaction manager is committed and re-opened after every
    batch run.  Batch sizes are dynamically adjusted to meet the stated
    time goal.
    """
    # Sanity check: either every table was extracted during this run,
    # or we are resuming a recoverable previous run.
    if self.last_extracted_table is None:
        if not self.needsRecovery():
            raise AssertionError("Can't pour: no tables extracted")
    elif self.last_extracted_table != len(self.tables) - 1:
        raise AssertionError(
            "Not safe to pour: last table '%s' was not extracted"
            % self.tables[-1])

    cur = self._commit(transaction_manager)

    # Disable sequential scans for the duration of the pour: as rows
    # are removed from a holding table its vital "id" index degrades,
    # and the planner might otherwise fall back to slow scans.
    postgresql.allow_sequential_scans(cur, False)

    # Pour each source table's holding table back, then drop it.
    for source_table in self.tables:
        raw_name = self.getRawHoldingTableName(source_table)
        if not postgresql.have_table(cur, raw_name):
            # Already poured out completely and dropped in an earlier
            # pass of this loop, before the failure we're apparently
            # recovering from.  Nothing to do for this table.
            continue

        quoted_name = self.getHoldingTableName(source_table)
        self.logger.info(
            "Pouring %s back into %s..." % (quoted_name, source_table))

        start_time = time.time()
        carries_new_id = postgresql.table_has_column(
            cur, raw_name, 'new_id')
        self._pourTable(
            quoted_name, source_table, carries_new_id,
            transaction_manager)

        # The holding table may still hold rows whose id is null; those
        # must never be poured, so the whole table is dropped outright.
        postgresql.drop_tables(cursor(), quoted_name)

        self.logger.debug(
            "Pouring %s took %.3f seconds."
            % (quoted_name, time.time() - start_time))

        cur = self._commit(transaction_manager)

    # Hand the choice back to the planner: sequential scans may be
    # used again if the database decides that's best.
    postgresql.allow_sequential_scans(cur, True)