def _run(self):
    """Import the converted chunk into its destination and remove the import file.

    If ``self.c.num_records_converted`` is greater than zero the chunk is marked
    ``importing``, imported through ``_import_to_crate`` or ``_import_to_mysql``
    (selected by ``self.c.destination_type``) and then marked ``imported`` along
    with its record count, elapsed milliseconds and end time. Otherwise the chunk
    is marked ``empty``. Afterwards verification is queued when
    ``config.ENABLE_VERIFIER`` is set, and the import file is removed from the
    destination host over ssh.

    Raises:
        Error: if ``self.c.destination_type`` is neither ``'crate'`` nor ``'mysql'``.

    Exceptions raised while removing the destination file are logged and
    swallowed; they never propagate to the caller.
    """
    if self.c.num_records_converted > 0:
        self.chunk.status = 'importing'
        self.chunk.update()

        start = datetime.now()
        if self.c.destination_type == 'crate':
            self._import_to_crate()
        elif self.c.destination_type == 'mysql':
            self._import_to_mysql()
        else:
            raise Error('Unknown destination type %r' % (self.c.destination_type,))

        # Measure once so the logged and the recorded elapsed time agree.
        elapsed = datetime.now() - start
        self.log(
            'Import to destination finished num_records_imported=%s destination_host=%s elapsed=%s',
            self.c.num_records_imported, self.c.destination_host, elapsed)

        self.chunk.status = 'imported'
        self.chunk.num_records_imported = self.c.num_records_imported
        self.chunk.import_elapsed_ms = int(elapsed.total_seconds() * 1000)
        self.chunk.end_time = int(time.time() * 1000)
        self.chunk.update()
    else:
        self.log('No rows to import')
        self.chunk.status = 'empty'
        self.chunk.end_time = int(time.time() * 1000)
        self.chunk.update()

    if config.ENABLE_VERIFIER:
        verifier.queue_verification(self.c)

    # TODO(jpatrin): Refactor this into its own task so it can be independently retried
    try:
        # The filename is escaped twice: once for the single-quoted `bash -c` argument and once for the
        # double-quoted remote command handed to ssh.
        cmd = (
            'ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o LogLevel=ERROR '
            '-i %s -p %r %s@%s "sudo bash -c \'rm -f %s\'"' % (
                config.SSH_PRIVKEY,
                self.c.destination_ssh_port,
                config.SSH_USER,
                self.c.destination_host,
                ssh.escape_double_quotes(ssh.escape_single_quotes(self.c.import_filename))))
        rm_cmd = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE)
        # Close stdin right away so ssh cannot block waiting for input.
        rm_cmd.stdin.close()
        rm_cmd.wait()
        if rm_cmd.returncode != 0:
            raise CommandException('Removing file on destination server failed with exit code %r' % (
                rm_cmd.returncode,))
    except Exception as e:
        # We catch and log all exceptions here to make this task idempotent. We DO NOT want this task to be
        # retried at this point as duplicate imports can fail on mysql and would cause us to corrupt the crate
        # chunk import records.
        self.log('Exception during removal of destination file, removal will not be retried %r import_filename=%s',
                 e, self.c.import_filename)
def _run(self):
    """Export one chunk of a mysql table to an OUTFILE on the source host and queue its streaming.

    Steps, in order:
      1. Sleep a random 0-10 seconds, then mark the chunk ``exporting`` and record its start time.
      2. Over ssh, create ``worker.SOURCE_DIR`` on the source host and remove any existing
         ``self.c.export_filename``.
      3. Run ``SELECT ... INTO OUTFILE`` for rows whose chunk column lies in
         ``[self.c.start_id, self.c.start_id + self.c.chunk_size)``, combined with any
         ``self.c.where_clauses``, and store the row count in ``self.c.num_records_exported``.
      4. Queue the streamer, record the exported count and elapsed milliseconds on the chunk, and
         move its status to ``exported`` only if it is still ``exporting``.
      5. Over ssh, append a ``STOP`` line to the export file to signal the end of the data.

    Raises:
        UnrecoverableError: if ``self.c.source_type`` is not ``'mysql'``.
        CommandException: if the ssh command that prepares the export location exits non-zero.
    """
    # Random delay before starting; presumably staggers concurrent exporters -- TODO confirm.
    time.sleep(random.randint(0, 10))
    start = datetime.now()

    if self.c.source_type != 'mysql':
        raise UnrecoverableError('This exporter only supports mysql sources, passed in source_type was %r' % (
            self.c.source_type,))

    self.chunk.status = 'exporting'
    self.chunk.start_time = int(time.time() * 1000)
    self.chunk.update()

    source_host = config.SOURCES[self.c.source_shard]['config']['read_host']['host']
    with db.shard_connection(self.c.source_shard, read=True) as mysql_conn:
        self.log('Starting export')

        # ssh target in user@host form, the same form the STOP signal command below uses.
        mysql_host_user = '%s@%s' % (config.SSH_USER, source_host)
        cmd = (
            'sudo mkdir -p %(source_dir)s && sudo chmod 777 %(source_dir)s; '
            '[ ! -e %(outfile)s ] || sudo rm %(outfile)s'
        ) % {
            'source_dir': worker.SOURCE_DIR,
            'outfile': self.c.export_filename,
        }
        sshcmd = (
            'ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o LogLevel=ERROR '
            '-o ControlMaster=no -o ControlPath=none '
            '-p %r -i %s %s "%s"' % (
                config.MYSQL_SSH_PORT, config.SSH_PRIVKEY, mysql_host_user,
                ssh.escape_double_quotes(cmd)))
        rm_existing = subprocess.Popen(sshcmd, shell=True, stdin=subprocess.PIPE)
        # Close stdin right away so ssh cannot block waiting for input.
        rm_existing.stdin.close()
        rm_existing.wait()
        if rm_existing.returncode != 0:
            raise CommandException('Checking for and removing export file failed with exit code %r' % (
                rm_existing.returncode,))

        # Inline trimming may be in self.c.where_clauses. Copy both lists so the chunk config itself is
        # not mutated by the range conditions appended below.
        wheres = copy.deepcopy(self.c.where_clauses)
        where_values = copy.deepcopy(self.c.where_values)
        # %%(?)s survives this formatting as %(?)s and is replaced with the connection's parameter
        # placeholder right before the query is executed.
        wheres.extend([
            '%s >= %%(?)s' % (self.c.table_config.chunk_col,),
            '%s < %%(?)s' % (self.c.table_config.chunk_col,),
        ])
        where_values.extend([
            self.c.start_id,
            self.c.start_id + self.c.chunk_size,
        ])

        # We want to kick off the streamer here to enable streaming while the export file is still being written
        # but if we do and the export below fails the streamer will be stuck in an infinite loop and the next time
        # the export task gets retried we'll kick off yet another streamer task, potentially corrupting data.
        # TODO(jpatrin): A fix for this would be to use a random token so the streamer knows it belongs to the
        # running exporter. Before kicking off the streamer, write a file with a random UUID next to where the
        # exported file will be. Put the token in self.c so it gets passed to the streamer. When the streamer
        # starts up, read the token file and check it vs. the token in self.c. If it's different, mark the chunk
        # as failed, end the streamer without doing anything, and don't retry the task.

        # TODO(jpatrin): Don't we need to add the join clause here to make the where_clauses work for inline
        # trimming?
        columns_sql = ', '.join(
            # We're using NULL_SENTINEL here because MySQL uses a nonstandard value for null
            # in its OUTFILE which is not as easy to detect and convert as I'd like.
            "IF(%s IS NULL, '%s', %s)" % (col.name, NULL_SENTINEL, mysql_conn.column_query_sql(col))
            for col in self.c.export_columns
        )
        sql = (
            "SELECT %(columns)s INTO OUTFILE '%(outfile)s' CHARACTER SET utf8 "
            # We use | as an escape character as MySQL's default of \ is problematic and at some times is not
            # compatible with the csv module's parser.
            """FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' ESCAPED BY '|' """
            "LINES TERMINATED BY '\\n' "
            "FROM %(schema)s.%(table)s %(table_alias)s WHERE %(wheres)s"
            # No ORDER BY on the chunk column: adding sorting here only slows down the query.
        ) % {
            'columns': columns_sql,
            'outfile': self.c.export_filename,
            'schema': self.c.source_schema,
            'table': self.c.table_config.table_name,
            'table_alias': self.c.table_config.table_alias,
            'wheres': ' AND '.join(wheres),
        }

        with db.cursor(mysql_conn) as cur:
            cur.execute(sql % {'?': mysql_conn.PARAMETER_PLACEHOLDER}, where_values)
            self.c.num_records_exported = cur.rowcount

    # kick off the streamer
    streamer.queue_stream_chunk(self.c)

    self.chunk.num_records_exported = self.c.num_records_exported
    self.chunk.export_elapsed_ms = int((datetime.now() - start).total_seconds() * 1000)
    self.chunk.update()
    # The streaming may or may not have started, so we only update the status if it's still set to exporting
    self.chunk.update_status('exported', where_status='exporting')

    # signal to the processor that we have reached the end of the data
    self.log('Signaling EOF to conversion')
    # The filename is escaped here for the double-quoted `bash -c` argument and the whole command is
    # escaped again below for the double-quoted remote command handed to ssh.
    cmd = 'sudo bash -c "echo STOP >> %s"' % (
        ssh.escape_double_quotes(self.c.export_filename),
    )
    signal_stop = subprocess.Popen(
        'ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o LogLevel=ERROR '
        '-p %r -i %s %s@%s "%s"' % (
            config.MYSQL_SSH_PORT, config.SSH_PRIVKEY, config.SSH_USER, source_host,
            ssh.escape_double_quotes(cmd)),
        shell=True,
        stdin=subprocess.PIPE)
    signal_stop.stdin.close()
    signal_stop.wait()
    if signal_stop.returncode != 0:
        # Only log the failure: the streamer has already been queued, and raising here could get the
        # export retried, which would queue yet another streamer for the same chunk.
        self.log('Signaling EOF to conversion failed with exit code %r export_filename=%s',
                 signal_stop.returncode, self.c.export_filename)

    self.log('Finished chunk export num_records_exported=%s elapsed=%s',
             self.c.num_records_exported, datetime.now() - start)