def test_run(self):
     self.mox.StubOutWithMock(exporter_mysql, 'time')
     exporter_mysql.time.sleep(mox.IgnoreArg())
     exporter_mysql.time.time().AndReturn(42)
     self.chunk.update()
     self.mox.StubOutWithMock(db, 'shard_connection')
     conn = self.mox.CreateMockAnything()
     db.shard_connection('src', read=True).AndReturn(self.context(conn))
     self.mox.StubOutWithMock(subprocess, 'Popen')
     proc = self.mox.CreateMockAnything()
     subprocess.Popen(mox.IgnoreArg(), shell=True, stdin=subprocess.PIPE).AndReturn(proc)
     proc.stdin = self.mox.CreateMockAnything()
     proc.stdin.close()
     proc.wait()
     proc.returncode = 0
     self.mox.StubOutWithMock(db, 'cursor')
     cur = self.mox.CreateMockAnything()
     db.cursor(conn).AndReturn(self.context(cur))
     cur.execute(mox.IgnoreArg(), mox.IgnoreArg())
     cur.rowcount = 1
     self.mox.StubOutWithMock(exporter_mysql.streamer, 'queue_stream_chunk')
     exporter_mysql.streamer.queue_stream_chunk(self.config)
     self.chunk.update()
     self.chunk.update_status('exported', where_status='exporting')
     proc = self.mox.CreateMockAnything()
     subprocess.Popen(mox.IgnoreArg(), shell=True, stdin=subprocess.PIPE).AndReturn(proc)
     proc.stdin = self.mox.CreateMockAnything()
     proc.stdin.close()
     proc.wait()
     self.mox.ReplayAll()
     self.worker._run()
def main():
    print('Generating data')
    with db.shard_connection('mysql', read=False) as conn:
        for host_id in xrange(1, 11):
            print('.')
            name = str(uuid.uuid4())
            time = datetime.now() - timedelta(days=365)
            with db.cursor(conn) as cur:
                cur.execute(
                    'INSERT INTO shinkansen.host (id, name, created) VALUES (%s, %s, %s)',
                    (host_id, name, time)
                )
            while time < datetime.now():
                status = 'up' if random.random() < 0.9 else 'down'
                with db.cursor(conn) as cur:
                    cur.execute(
                        'INSERT INTO shinkansen.host_checkin (host_id, time, status) VALUES (%s, %s, %s)',
                        (host_id, time, status)
                    )
                if status == 'up':
                    for metric in ['cpu', 'mem', 'disk']:
                        with db.cursor(conn) as cur:
                            cur.execute(
                                'INSERT INTO shinkansen.host_metric (host_id, name, time, value) '
                                'VALUES (%s, %s, %s, %s)',
                                (host_id, metric, time, random.random())
                            )
                time += timedelta(days=1)
        conn.commit()
    print('done')
 def test_get_count(self):
     cur = self.mox.CreateMockAnything()
     self.mox.StubOutWithMock(db, 'cursor')
     db.cursor(None).AndReturn(self.context(cur))
     cur.execute(mox.IgnoreArg())
     cur.fetchone().AndReturn((42,))
     self.mox.ReplayAll()
     self.assertEqual(42, shinkansen_trim_crate_data.get_count(None, ''))
    def get_column_metadata(self, conn):
        columns = []
        with db.cursor(conn, dictionary=True) as cur:
            cur.execute(
                'SELECT * FROM information_schema.columns WHERE schema_name = %(?)s AND table_name = %(?)s ' % {
                    '?': conn.PARAMETER_PLACEHOLDER
                },
                (self.c.source_schema.lower(), self.c.table_config.table_name.lower())
            )
            column_recs = cur.fetchall()
        pk_cols = set()
        with db.cursor(conn) as cur:
            cur.execute(
                'SELECT constraint_name FROM information_schema.table_constraints '
                'WHERE constraint_type = %(?)s AND schema_name = %(?)s AND table_name = %(?)s' % {
                    '?': conn.PARAMETER_PLACEHOLDER
                },
                ('PRIMARY_KEY', self.c.source_schema.lower(), self.c.table_config.table_name.lower())
            )
            for (constraint_name,) in cur.fetchall():
                # constraint_name is a list of the columns in the key
                for column in constraint_name:
                    pk_cols.add(column.lower())

        for column in column_recs:
            col = db.Column(
                column['column_name'],
                self.TYPE_MAP[column['data_type']],
                column['column_name'].lower() in pk_cols,
                ignore=(column['column_name'].lower() in self.c.table_config.ignore_columns),
            )

            columns.append(col)
            self.column_map[col.lname] = col

        if (
            self.c.migration_type == orm.MigrationType.DELTA
            or self.c.chunk_migration_type == orm.ChunkMigrationType.DIRECT
        ):
            # Check the destination for the primary key columns as well since the schemas may be different
            with db.shard_connection(self.c.destination_shard, read=True) as conn:
                primary_key = conn.get_table_primary_key_columns(
                    self.c.destination_schema,
                    self.c.table_config.table_name
                )
            for col_name in primary_key:
                if col_name.lower() not in self.column_map:
                    raise UnrecoverableError(
                        'Primary key column in destination does not exist in source '
                        'table=%s column=%s source=%s destination=%s' % (
                            self.c.table_config.table_name, col_name, self.c.source_shard, self.c.destination_shard))
                self.column_map[col_name.lower()].is_primary_key = True
        self.c.columns = columns
Example #5
 def upsert(self, conn, records, _recursed=False):
     with db.cursor(conn) as cur:
         sql = (
             'INSERT INTO %(schema)s.%(table)s (%(columns)s) VALUES (%(placeholders)s) '
             'ON DUPLICATE KEY UPDATE %(sets)s'
         ) % {
             'schema': self.c.destination_schema,
             'table': self.c.table_config.table_name,
             'columns': ', '.join(col.name for col in self.c.export_columns),
             'placeholders': ', '.join(
                 conn.column_insert_sql(col) for col in self.c.export_columns
             ) % {'?': conn.PARAMETER_PLACEHOLDER},
             'sets': ', '.join(
                 '%s = VALUES(%s)' % (col.name, col.name)
                 for col in self.c.export_columns
                 if not col.is_primary_key
             ),
         }
         try:
             results = cur.executemany(sql, records)
         except Exception, e:
             if config.IGNORE_CONSTRAINT_FAILURES and 'foreign key constraint fails' in str(e) and not _recursed:
                 self.log('Foreign key constraint error, trying one by one')
                 for record in records:
                     try:
                         self.upsert(conn, [record], _recursed=True)
                     except Exception, e:
                         self.log('Record failed insert due to %r: %r', e, record)
                 return
             else:
                 raise  # re-raise anything not handled by the foreign key fallback above
 def _import_to_crate(self):
     with db.shard_connection(self.c.destination_shard, read=True) as conn:
         self.log('Starting import')
         with db.cursor(conn) as cur:
             sql = "COPY %s.%s FROM '%s'" % (
                 self.c.destination_schema, self.c.table_config.table_name, self.c.import_filename)
             cur.execute(sql)
             self.c.num_records_imported = cur.rowcount
         conn.commit()
    def test_get_table_metadata_full(self):
        self.mox.StubOutWithMock(queuer.db, 'shard_connection')
        conn = self.mox.CreateMockAnything()
        queuer.db.shard_connection('src', read=True).AndReturn(self.context(conn))
        self.mox.StubOutWithMock(queuer.db, 'cursor')
        cur = self.mox.CreateMockAnything()
        conn.get_current_timestamp().AndReturn(1)
        self.tt.update()
        self.mox.StubOutWithMock(queuer.QueueChunksWorker, 'get_column_metadata')
        queuer.QueueChunksWorker.get_column_metadata(conn)
        db.cursor(conn).AndReturn(self.context(cur))
        cur.execute(mox.IgnoreArg(), mox.IgnoreArg())
        cur.fetchone().AndReturn((1, 2, 3))

        self.mox.ReplayAll()
        self.config.migration_type = orm.MigrationType.FULL
        worker = queuer.QueueChunksWorker(self.config, self.redis)
        worker.get_table_metadata()
def main():
    with db.shard_connection('crate', read=True) as conn:
        print('Creating shinkansen.host')
        with db.cursor(conn) as cur:
            cur.execute('DROP TABLE IF EXISTS shinkansen.host')
        with db.cursor(conn) as cur:
            cur.execute('''
            CREATE TABLE shinkansen.host (
                id INT PRIMARY KEY,
                name String,
                created TIMESTAMP
            )''')
        print('Creating shinkansen.host_checkin')
        with db.cursor(conn) as cur:
            cur.execute('DROP TABLE IF EXISTS shinkansen.host_checkin')
        with db.cursor(conn) as cur:
            cur.execute('''CREATE TABLE shinkansen.host_checkin (
                host_id INT,
                time TIMESTAMP,
                status String
            )''')
        print('Creating shinkansen.host_metric')
        with db.cursor(conn) as cur:
            cur.execute('DROP TABLE IF EXISTS shinkansen.host_metric')
        with db.cursor(conn) as cur:
            cur.execute('''CREATE TABLE shinkansen.host_metric (
                host_id INT,
                name String,
                time TIMESTAMP,
                value String
            )''')
    def test_get_table_metadata_delta(self):
        self.config.delta_start = 12345
        self.mox.StubOutWithMock(queuer.db, 'shard_connection')
        conn = self.mox.CreateMockAnything()
        queuer.db.shard_connection('src', read=True).AndReturn(self.context(conn))
        self.mox.StubOutWithMock(queuer.db, 'cursor')
        cur = self.mox.CreateMockAnything()
        conn.get_current_timestamp().AndReturn(1)
        self.tt.update()
        self.mox.StubOutWithMock(queuer.QueueChunksWorker, 'get_column_metadata')
        queuer.QueueChunksWorker.get_column_metadata(conn)
        conn.from_unixtime_value(12345).AndReturn('2042T')

        db.cursor(conn).AndReturn(self.context(cur))
        cur.execute(mox.IgnoreArg(), mox.IgnoreArg())
        cur.fetchone().AndReturn((1, 2, 3))

        self.mox.ReplayAll()
        self.config.migration_type = orm.MigrationType.DELTA
        worker = queuer.QueueChunksWorker(self.config, self.redis)
        worker.column_map = {'a': {}, 'b': {}, 'c': {}}
        worker.get_table_metadata()
def trim_data(conn, sql, trim_time_str, shard_config, table_name, delete, partition_val):
    num_rows = get_count(conn, sql)
    if delete:
        print('Deleting %r from %s.%s older than %s%s' % (
            num_rows, shard_config['default_schema_name'], table_name, trim_time_str,
            '' if partition_val is None else ' for partition %s' % (partition_val,)
        ))
        with db.cursor(conn) as cur:
            cur.execute('DELETE ' + sql)
    else:
        print('%r records to delete from %s.%s older than %s%s' % (
            num_rows, shard_config['default_schema_name'], table_name, trim_time_str,
            '' if partition_val is None else ' for partition %s' % (partition_val,)
        ))
Example #11
    def get_table_metadata(self):
        with db.shard_connection(self.c.source_shard, read=True) as conn:
            self.table.source_start_time = conn.get_current_timestamp()
            self.table.start_time = int(time.time() * 1000)
            self.table.status = 'in_progress'
            self.table.update()

            self.get_column_metadata(conn)

            self.c.where_clauses, self.c.where_values = worker.generate_where(conn, self.c, self.c.table_config)

            if (
                self.c.migration_type == orm.MigrationType.DELTA
                and self.c.table_config.join
            ):
                # TODO: come up with a different way to do deltas for crate with a join clause. We don't need the chunking if we do it via json export
                if self.c.source_type == 'crate':
                    self.log_error(
                        'The %s table specifies a join clause but joins are not supported for crate due to lack of '
                        'aggregation support for JOIN queries. This table will not have any delta migrations '
                        'performed.',
                        self.c.table_config.table_name
                    )
                    return
                join = self.c.table_config.join % {'schema': self.c.source_schema}
            else:
                join = ''

            sql = (
                'SELECT COUNT(*), MIN(%(chunk_col)s), MAX(%(chunk_col)s) '
                'FROM %(schema)s.%(table)s %(table_alias)s %(join)s %(where)s'
            ) % {
                'chunk_col': self.c.table_config.chunk_col,
                'schema': self.c.source_schema,
                'table': self.c.table_config.table_name,
                'table_alias': self.c.table_config.table_alias,
                # We only need the join clause for delta and direct currently
                'join': join,
                'where': (' WHERE ' + (' AND '.join(self.c.where_clauses))) if self.c.where_clauses else ''
            }
            with db.cursor(conn) as cur:
                cur.execute(sql % {'?': conn.PARAMETER_PLACEHOLDER}, self.c.where_values)
                (self.num_rows, self.min_id, self.max_id) = cur.fetchone()
        self.log('num_rows=%r min_id=%r max_id=%r', self.num_rows, self.min_id, self.max_id)
 def _import_to_mysql(self):
     with db.shard_connection(self.c.destination_shard, read=False) as conn:
         self.log('Starting import')
         with db.cursor(conn) as cur:
             sql = (
                 (
                     "LOAD DATA INFILE '%(infile)s' "
                     # TODO(jpatrin): If the mysql source had timestamp fields with the special value
                     # '0000-00-00 00:00:00' then the imports will fail if the mysql server is set to be strict about
                     # timestamp types. In this case adding IGNORE below will fix the issue, but may mask other
                     # issues. If this issue recurs we should probably add special support to the migrator to handle
                     # the special '0000-00-00 00:00:00' timestamp value, similar to what is done for NULL.
                     # "IGNORE "
                     "INTO TABLE %(schema)s.%(table)s CHARACTER SET utf8 "
                     # We use | as the escape character because MySQL's default of \ is problematic and is sometimes
                     # incompatible with the csv module's parser.
                     """FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' ESCAPED BY '|' """
                     "LINES TERMINATED BY '\\n' "
                     '(%(columns)s) SET %(sets)s'
                 ) % {
                     'infile': self.c.import_filename,
                     'schema': self.c.destination_schema,
                     'table': self.c.table_config.table_name,
                     'columns': ', '.join(
                         ('' if col.ignore else '@') + col.name
                         for col in self.c.export_columns
                     ),
                     'sets': ', '.join(
                         "%s = IF(%s = '%s', NULL, %s)" % (
                             col.name,
                             '@' + col.name,
                             NULL_SENTINEL,
                             conn.column_insert_sql(col) % {'?': '@' + col.name}
                         )
                         for col in self.c.export_columns
                         if not col.ignore
                     ),
                 }
             )
             cur.execute(sql)
             self.c.num_records_imported = cur.rowcount
         conn.commit()
Example #13
    def get_column_metadata(self, conn):
        columns = []
        with db.cursor(conn, dictionary=True) as cur:
            cur.execute('DESCRIBE %s.%s' % (self.c.source_schema, self.c.table_config.table_name))
            for column in cur.fetchall():
                alias = '%s.' % (self.c.table_config.table_alias,) if self.c.table_config.table_alias else ''
                base_type = column['Type'].lower().split('(')[0]  # Ignore anything after an opening parenthesis
                col = db.Column(
                    column['Field'],
                    self.TYPE_MAP.get(base_type, db.ColumnType.STRING),
                    column['Key'] == 'PRI',
                    ignore=(column['Field'].lower() in self.c.table_config.ignore_columns),
                    source_alias=alias
                )

                columns.append(col)
                self.column_map[col.lname] = col

        if (
            self.c.migration_type == orm.MigrationType.DELTA
            or self.c.chunk_migration_type == orm.ChunkMigrationType.DIRECT
        ):
            # Check the destination for the primary key columns as well since the schemas may be different
            with db.shard_connection(self.c.destination_shard, read=True) as conn:
                primary_key = conn.get_table_primary_key_columns(
                    self.c.destination_schema,
                    self.c.table_config.table_name
                )
            for col_name in primary_key:
                if col_name.lower() not in self.column_map:
                    raise UnrecoverableError(
                        'Primary key column in destination does not exist in source '
                        'table=%s column=%s source=%s destination=%s' % (
                            self.c.table_config.table_name, col_name, self.c.source_shard, self.c.destination_shard))
                self.column_map[col_name.lower()].is_primary_key = True
        self.c.columns = columns
    def check_table(self, table_config, conn):
        table_data = orm.Table.get(
            self.redis_conn,
            migration_id=self.c.migration_id,
            partition_val=self.c.partition_val, namespace=self.c.namespace,
            source_shard=self.c.source_shard, destination_shard=self.c.destination_shard,
            table_name=table_config.table_name)
        if table_data is None:
            return None
        with db.cursor(conn) as cur:
            # update the table_config so logging is correct
            self.c.table_config = table_config
            # TODO(jpatrin): Add join support for non-crate destinations

            # TODO(jpatrin): Disabling min and max checks for now as the query is different for crate vs. mysql
            if self.c.migration_type == orm.MigrationType.DELTA:
                if table_config.join:
                    self.log_warning('Verification is unsupported for tables in delta migrations with a join clause')
                    table_data.verification_status = 'unknown'
                    table_data.update()
                    return table_data
            elif self.c.migration_type != orm.MigrationType.FULL:
                raise shinkansen.UnrecoverableError('Migration type %r unknown' % (self.c.migration_type,))
            # TODO(jpatrin): The verifier should technically take the join clause into account so it gets the same
            # result as the queuer and exporter, but crate doesn't support joins with aggregation. As long as the
            # destination only has the records we have inserted into it the join shouldn't be needed, though.

            (where_clauses, where_values) = worker.generate_where(conn, self.c, table_config)

            sql = (
                'SELECT COUNT(*) '  # , MIN(%(chunk_col)s), MAX(%(chunk_col)s) '
                'FROM %(schema)s.%(table)s %(table_alias)s %(where)s'  # %(join)s
            ) % {
                # 'chunk_col': chunk_col,
                'schema': self.c.destination_schema,
                'table': table_config.table_name,
                'table_alias': table_config.table_alias,
                # 'join': (self.c.table_config.join % {'schema': self.c.destination_schema}
                #          if self.c.migration_type == orm.MigrationType.DELTA
                #          or self.c.chunk_migration_type == orm.ChunkMigrationType.DIRECT
                #          else ''),
                'where': (' WHERE ' + (' AND '.join(where_clauses))) if where_clauses else ''
            }
            cur.execute(sql % {'?': conn.PARAMETER_PLACEHOLDER}, where_values)
            #(num_rows, min_id, max_id) = cur.fetchone()
            (num_rows,) = cur.fetchone()
        errors = []
        if table_data.num_records != num_rows:
            errors.append('The queued number of rows (%r) and the resulting number of rows (%r) do not match' % (
                table_data.num_records, num_rows))
        #if table_data.min_id != min_id:
        #    errors.append('The queued min_id (%r) and the resulting min_id (%r) do not match' % (
        #        table_data.min_id, min_id))
        #if table_data.max_id != max_id:
        #    errors.append('The queued max_id (%r) and the resulting max_id (%r) do not match' % (
        #        table_data.max_id, max_id))
        if len(errors) > 0:
            self.log_error('Verification errors: %s', ', '.join(errors))
            table_data.verification_status = 'failed'
        else:
            self.log('Verification succeeded')
            table_data.verification_status = 'verified'
        table_data.update()
        return table_data
Example #15
    def _run(self):
        time.sleep(random.randint(0, 10))
        start = datetime.now()
        if self.c.source_type != 'mysql':
            raise UnrecoverableError('This exporter only supports mysql sources, passed in source_type was %r' % (
                self.c.source_type,))
        self.chunk.status = 'exporting'
        self.chunk.start_time = int(time.time() * 1000)
        self.chunk.update()
        source_host = config.SOURCES[self.c.source_shard]['config']['read_host']['host']
        with db.shard_connection(self.c.source_shard, read=True) as mysql_conn:
            self.log('Starting export')

            # user@host target for the ssh commands below (mirrors the user/host used for signal_stop further down)
            mysql_host_user = '%s@%s' % (
                config.SSH_USER, source_host)
            cmd = (
                'sudo mkdir -p %(source_dir)s && sudo chmod 777 %(source_dir)s; '
                '[ ! -e %(outfile)s ] || sudo rm %(outfile)s'
            ) % {
                'tmp_dir': config.TMP_DIR,
                'source_dir': worker.SOURCE_DIR,
                'outfile': self.c.export_filename,
            }
            sshcmd = (
                'ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o LogLevel=ERROR '
                '-o ControlMaster=no -o ControlPath=none '
                '-p %r -i %s %s "%s"' % (
                    config.MYSQL_SSH_PORT, config.SSH_PRIVKEY, mysql_host_user, ssh.escape_double_quotes(cmd)))
            rm_existing = subprocess.Popen(
                sshcmd,
                shell=True,
                stdin=subprocess.PIPE)
            rm_existing.stdin.close()
            rm_existing.wait()
            if rm_existing.returncode != 0:
                raise CommandException('Checking for and removing export file failed with exit code %r' % (
                    rm_existing.returncode,))


            # Inline trimming may be in self.c.where_clauses
            wheres = copy.deepcopy(self.c.where_clauses)
            where_values = copy.deepcopy(self.c.where_values)
            wheres.extend([
                '%s >= %%(?)s' % (self.c.table_config.chunk_col,),
                '%s < %%(?)s' % (self.c.table_config.chunk_col,),
            ])
            where_values.extend([
                self.c.start_id,
                self.c.start_id + self.c.chunk_size,
            ])

            # We want to kick off the streamer here to enable streaming while the export file is still being written
            # but if we do and the export below fails the streamer will be stuck in an infinite loop and the next time
            # the export task gets retried we'll kick off yet another streamer task, potentially corrupting data.
            # TODO(jpatrin): A fix for this would be to use a random token so the streamer knows it belongs to the
            # running exporter. Before kicking off the streamer, write a file with a random UUID next to where the
            # exported file will be. Put the token in self.c so it gets passed to the streamer. When the streamer
            # starts up, read the token file and check it vs. the token in self.c. If it's different, mark the chunk
            # as failed, end the streamer without doing anything, and don't retry the task.
            #streamer.queue_stream_chunk(self.c)
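            # A minimal, hypothetical sketch of the token idea described in the TODO above (not wired into the
            # current flow): the exporter would drop a UUID next to the export file and stash it in self.c, and
            # the streamer would compare the two before streaming. The helper name and the '.token' suffix are
            # illustrative assumptions.
            def _write_export_token_sketch():
                import uuid  # local import; a module-level import is not assumed here
                token = str(uuid.uuid4())
                token_cmd = 'sudo bash -c "echo %s > %s.token"' % (
                    token, ssh.escape_double_quotes(self.c.export_filename))
                write_token = subprocess.Popen(
                    'ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o LogLevel=ERROR '
                    '-p %r -i %s %s "%s"' % (
                        config.MYSQL_SSH_PORT, config.SSH_PRIVKEY, mysql_host_user,
                        ssh.escape_double_quotes(token_cmd)),
                    shell=True,
                    stdin=subprocess.PIPE)
                write_token.stdin.close()
                write_token.wait()
                if write_token.returncode != 0:
                    raise CommandException('Writing export token failed with exit code %r' % (
                        write_token.returncode,))
                return token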

            # TODO(jpatrin): Don't we need to add the join clause here to make the where_clauses work for inline
            # trimming?
            sql = (
                (
                    "SELECT %(columns)s INTO OUTFILE '%(outfile)s' CHARACTER SET utf8 "
                    # We use | as the escape character because MySQL's default of \ is problematic and is sometimes
                    # incompatible with the csv module's parser.
                    """FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' ESCAPED BY '|' """
                    "LINES TERMINATED BY '\\n' "
                    "FROM %(schema)s.%(table)s %(table_alias)s WHERE %(wheres)s"
                    # Adding sorting here only slows down the query.
                    # "ORDER BY %(chunk_col)s ASC"
                ) % {
                    'columns': ', '.join(
                        (
                            # We're using NULL_SENTINEL here because MySQL uses a nonstandard value for null
                            # in its OUTFILE which is not as easy to detect and convert as I'd like.
                            "IF(%s IS NULL, '%s', %s)" % (
                                col.name,
                                NULL_SENTINEL,
                                mysql_conn.column_query_sql(col)
                            )
                        ) for col in self.c.export_columns
                    ),
                    'outfile': self.c.export_filename,
                    'schema': self.c.source_schema,
                    'table': self.c.table_config.table_name,
                    'table_alias': self.c.table_config.table_alias,
                    'wheres': ' AND '.join(wheres),
                }
            )
            with db.cursor(mysql_conn) as cur:
                cur.execute(sql % {'?': mysql_conn.PARAMETER_PLACEHOLDER}, where_values)
                self.c.num_records_exported = cur.rowcount

        # kick off the streamer
        streamer.queue_stream_chunk(self.c)

        self.chunk.num_records_exported = self.c.num_records_exported
        self.chunk.export_elapsed_ms = int((datetime.now() - start).total_seconds() * 1000)
        self.chunk.update()
        # The streaming may or may not have started, so we only update the status if it's still set to exporting
        self.chunk.update_status('exported', where_status='exporting')

        # signal to the processor that we have reached the end of the data
        self.log('Signaling EOF to conversion')
        cmd = 'sudo bash -c "echo STOP >> %s"' % (
            ssh.escape_double_quotes(self.c.export_filename),
        )
        signal_stop = subprocess.Popen(
            'ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o LogLevel=ERROR '
            '-p %r -i %s %s@%s "%s"' % (
                config.MYSQL_SSH_PORT, config.SSH_PRIVKEY, config.SSH_USER,
                source_host,
                ssh.escape_double_quotes(cmd)),
            shell=True,
            stdin=subprocess.PIPE)
        signal_stop.stdin.close()
        signal_stop.wait()

        self.log('Finished chunk export num_records_exported=%s elapsed=%s',
                 self.c.num_records_exported, datetime.now() - start)
def get_count(conn, sql):
    with db.cursor(conn) as cur:
        cur.execute('SELECT COUNT(*) ' + sql)
        (num_rows,) = cur.fetchone()
    return num_rows
Example #17
    def _run(self):
        time.sleep(random.randint(0, 10))
        start = datetime.now()
        if self.c.source_type != 'crate':
            raise UnrecoverableError('This exporter only supports crate sources, passed in source_type was %r' % (
                self.c.source_type,))
        self.chunk.status = 'exporting'
        self.chunk.start_time = int(time.time() * 1000)
        self.chunk.update()
        with db.shard_connection(self.c.source_shard, read=True) as crate_conn:
            self.log('Starting export')

            self.c.export_dir = os.path.join(worker.SOURCE_DIR, ('%s_%s.%s_%s_%s' % (
                self.c.migration_id, self.c.source_schema, self.c.table_config.table_name, self.c.partition_val,
                self.c.chunk_num)))

            def make_source_dir(node):
                cmd = (
                    'sudo mkdir -p %(export_dir)s && sudo chmod 777 %(export_dir)s'
                ) % {
                    'export_dir': self.c.export_dir,
                }
                try:
                    ssh.SSHHost(
                        node['host'], node['ssh_port'], config.SSH_USER,
                        identity=config.SSH_PRIVKEY
                    ).run(cmd)
                except ssh.SSHException, e:
                    raise CommandException('Creating export directory on node failed %r' % (e,))

            crate_cluster = config.DESTINATIONS[self.c.source_shard]
            data_nodes = crate_cluster['config']['data_nodes']
            threads = []
            for node in data_nodes:
                thread = threading.Thread(target=make_source_dir, args=(node,))
                thread.start()
                threads.append(thread)
            for thread in threads:
                thread.join()

            # Inline trimming may be in self.c.where_clauses
            wheres = copy.deepcopy(self.c.where_clauses)
            where_values = copy.deepcopy(self.c.where_values)
            wheres.extend([
                '%s >= %%(?)s' % (self.c.table_config.chunk_col,),
                '%s < %%(?)s' % (self.c.table_config.chunk_col,),
            ])
            where_values.extend([
                self.c.start_id,
                self.c.start_id + self.c.chunk_size,
            ])

            if self.c.table_config.join:
                # TODO: This is a HORRIBLE HACK that removes the hard-coded table alias from the partition and chunk
                # columns.
                # It is likely to cause problems. A refactoring of the join code is needed.
                column_alias = '%s.' % (self.c.table_config.table_alias,)
                fixed_wheres = []
                for where in wheres:
                    if where.startswith(column_alias):
                        where = where[len(column_alias):]
                    fixed_wheres.append(where)
                wheres = fixed_wheres

            # We want to kick off the streamer here to enable streaming while the export file is still being written
            # but if we do and the export below fails the streamer will be stuck in an infinite loop and the next time
            # the export task gets retried we'll kick off yet another streamer task, potentially corrupting data.
            # TODO(jpatrin): A fix for this would be to use a random token so the streamer knows it belongs to the
            # running exporter. Before kicking off the streamer, write a file with a random UUID next to where the
            # exported file will be. Put the token in self.c so it gets passed to the streamer. When the streamer
            # starts up, read the token file and check it vs. the token in self.c. If it's different, mark the chunk
            # as failed, end the streamer without doing anything, and don't retry the task.
            # NOTE: This would only work with crate if we append the STOP sentinel to all of the json files that each
            # node writes to export_dir or if we come up with another way to know when the export is finished.
            #streamer.queue_stream_chunk(self.c)

            # TODO(jpatrin): Don't we need to add the join clause here to make the where_clauses work for inline
            # trimming? They won't work with COPY TO, but we may need to figure out a way to support it.
            sql = (
                "COPY %(schema)s.%(table)s (%(columns)s) "
                "WHERE %(wheres)s "
                "TO DIRECTORY '%(export_dir)s' "
                "WITH (format='json_array') "
            ) % {
                'columns': ', '.join(col.name for col in self.c.export_columns),
                'schema': self.c.source_schema,
                'table': self.c.table_config.table_name,
                'wheres': ' AND '.join(wheres),
                'export_dir': self.c.export_dir,
            }
            #self.log_warning('%s', sql)
            #self.log_warning(repr(where_values))
            with db.cursor(crate_conn) as cur:
                cur.execute(sql % {'?': crate_conn.PARAMETER_PLACEHOLDER}, where_values)
                self.c.num_records_exported = cur.rowcount
Example #18
    def migrate(self, source_conn, dest_conn):
        # TODO: Implement option #2 as well as large numbers of records will not work with option #1 and crate.
        #       The python crate client always loads all result records into memory before returning anything
        #       (it doesn't support streaming results, like any normal DB API would), so we need to either support
        #       both of these options and switch between them at some threshold of record count or just use the second
        #       option.

        # When migrating from crate we need to either have an explicit limit or work around the implicit limit of
        # 10000 records. In order to make this work we need to do one of the following:
        #
        # 1) pre-query for the count and use that as the explicit limit (plus a fudge factor? multiplied?)
        #  * prone to errors if the number of records in the chunk would change between query time and the final
        #    SELECT. If the number of records increases enough between the COUNT and the SELECT in the result then
        #    the query could miss some of the records and not get picked up by a later autodelta as they would not
        #    have been updated.
        #  * Can mitigate by running the count then multiplying it by 2. Use that as the explicit LIMIT. Check the
        #    number of records we got vs. that limit. If we got exactly that number of records we need to try again
        #    with the limit doubled again.
        #    * This means we're re-doing all of the work but the possibility of this happening should be low enough
        #      that this only happens in extreme circumstances.
        #
        # 2) add an ORDER BY and use the ordered column to query for more records after each SELECT finishes.
        #  * adding ORDER BY slows down the query and adds load to the source database
        #  * prone to missing records which might have been inserted below any record we get on each loop
        #    i.e. assumes that the ordered field is an always increasing id field, like a mysql autoincrement id
        #  * using autodelta migrations (or a complete migration) should mean that any records potentially missed
        #    would be picked up by the delta migrations
        #  * ORDER BY and LIMIT only works if we have a unique column or primary key to use as an extra WHERE clause.
        #    Any primary key could potentially work but the problem is defining the where clause to get a part of the
        #    ordering.
        #    * This should be fixable by using ORDER BY and LIMIT with OFFSET, but this is less efficient than the extra
        #      WHERE clause as it means the server needs to scan the results to the OFFSET value each time.
        #
        # We need to implement #2 due to crate-python's inability to stream results. Very large results will not only
        # slow down the processing but are likely to cause memory errors in this module.
        #
        # NOTE: crate can't sort by a partitioned column. If the primary key has a partition column then we
        # can't use the ORDER, LIMIT, WHERE pk > max option.
        # The only option in this case is to use ORDER, LIMIT, OFFSET while ordering only by the non-partition primary
        # key columns.
        #
        # What about a table where a single primary key column is also the partition column? We could potentially use
        # all indexed non-partition columns in the ORDER BY but this slows down the query.
        #
        # Maybe the right thing would just be to use COPY WHERE TO and stream the json files from the data nodes and
        # upsert them to the destination from there. That way we don't have to perform any heroics to get any data
        # out of crate. The downside, of course, is that we'll need to ssh to the data nodes to stream the files.
        # * Can use COPY table (columns...) WHERE ... TO DIRECTORY ... WITH (format='json_array') to reduce duplication
        #   of keys in the json.
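        # A minimal, hypothetical sketch of the keyset-pagination predicate described above, i.e. the
        # '(c1 > v1) OR (c1 = v1 AND (c2 > v2 OR ...))' expansion. The project's actual
        # generate_primary_key_sql (used further below) lives elsewhere; this local helper is only an
        # illustration and is not called by this method.
        def _keyset_where_sketch(key_columns, key_max_values, placeholder='%(?)s'):
            # Build the clause from the innermost (last) key column outward.
            clause = '%s > %s' % (key_columns[-1].name, placeholder)
            values = [key_max_values[-1]]
            for col, val in zip(reversed(key_columns[:-1]), reversed(key_max_values[:-1])):
                clause = '(%s > %s OR (%s = %s AND %s))' % (
                    col.name, placeholder, col.name, placeholder, clause)
                values = [val, val] + values
            return clause, values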

        wheres = deepcopy(self.c.where_clauses) + [
            '%s >= %%(?)s' % (self.c.table_config.chunk_col,),
            '%s < %%(?)s' % (self.c.table_config.chunk_col,),
        ]
        base_values = deepcopy(self.c.where_values) + [
            self.c.start_id,
            self.c.start_id + self.c.chunk_size,
        ]

        if self.c.migration_type == orm.MigrationType.DELTA and self.c.table_config.join:
            if self.c.source_type == 'crate':
                self.log('Chunk cannot be piped, migration type is delta, table has a join, and source is crate. '
                         'Chunk will be exported and streamed instead.')
                self.chunk.status = 'queued'
                self.chunk.update()
                exporter.queue_export_chunk(self.c)
                return False
            join = self.c.table_config.join % {'schema': self.c.source_schema}
        else:
            join = ''

        base_sql = (
            'FROM %(schema)s.%(table)s %(table_alias)s %(join)s '
            'WHERE %(where_clauses)s'
        ) % {
            'schema': self.c.source_schema,
            'table': self.c.table_config.table_name,
            'table_alias': self.c.table_config.table_alias,
            'join': join,
            'where_clauses': ' AND '.join(wheres)
        }

        # crate has an implicit limit of 10000, we query for the count here to make sure we get all
        # of the records
        if self.c.source_type == 'crate':
            self.log('Querying for chunk size')
            sql = 'SELECT COUNT(*) %s' % (base_sql,)
            with db.cursor(source_conn) as source_cur:
                source_cur.execute(sql % {'?': source_conn.PARAMETER_PLACEHOLDER}, base_values)
                (count,) = source_cur.fetchone()

            if not count:
                self.log('No data found for chunk')
                return True

            use_order = count > config.MAX_CRATE_RESULT_SIZE

            if use_order:
                (
                    primary_key_indexes,
                    primary_key_columns,
                ) = zip(*[
                    (
                        i,
                        col,
                    )
                    for (i, col) in enumerate(self.c.export_columns)
                    if col.is_primary_key
                ])
                use_offset = False
                if not primary_key_columns:
                    self.log_warning('Table has no primary key columns')
                    use_offset = True
                else:
                    unorderable_columns = [
                        col.lower() for col in
                        source_conn.get_unorderable_columns(
                            self.c.source_schema,
                            self.c.table_config.table_name
                        )
                    ]
                    if any(pkc.name.lower() in unorderable_columns for pkc in primary_key_columns):
                        self.log_warning('Table has primary key columns that cannot be used for sorting')
                        use_offset = True

                if use_offset:
                    self.log('Chunk cannot be piped, it must be exported and streamed')
                    self.chunk.status = 'queued'
                    self.chunk.update()
                    exporter.queue_export_chunk(self.c)
                    return False

                #     self.log_warning(
                #         'Falling back to full ordering and LIMIT OFFSET querying. Depending on the cardinality of the '
                #         'fields this may be very expensive or miss records depending on whether the sort order is '
                #         'deterministic.'
                #     )
                #     (
                #         key_indexes,
                #         key_columns
                #     ) = zip(*[
                #         (
                #             i,
                #             col,
                #         )
                #         for (i, col) in enumerate(self.c.export_columns)
                #         if col.name.lower() not in unorderable_columns
                #     ])

                else:
                    self.log(
                        'Chunk size (%u) is larger than the configured MAX_CRATE_RESULT_SIZE (%u). '
                        'This chunk will be broken up into multiple ordered queries.' % (
                            count,
                            config.MAX_CRATE_RESULT_SIZE,
                        )
                    )
                    key_indexes = primary_key_indexes
                    key_columns = primary_key_columns

                # if not key_columns:
                #     raise UnrecoverableError(
                #         'No sortable columns found, cannot migrate this chunk as it is larger '
                #         'than MAX_CRATE_RESULT_SIZE'
                #     )

                limit = config.MAX_CRATE_RESULT_SIZE
                order_sql = 'ORDER BY %s LIMIT %u' % (
                    ', '.join(col.name for col in key_columns),
                    limit
                )
            else:
                limit = count
        else:
            use_order = False

        self.chunk.num_records_exported = 0
        self.chunk.update()

        key_max_values = []

        while True:
            # TODO: Refactor?
            if use_order:
                if use_offset:
                    raise UnrecoverableError('Implement ORDER LIMIT OFFSET?')
                else:
                    if key_max_values:
                        # WHERE c1 > mv1 OR c1 == mv1 AND (c2 > mv2 OR c2 == mv2 AND c3 > mv3)
                        (
                            primary_key_sql,
                            primary_key_values,
                        ) = generate_primary_key_sql(key_columns, key_max_values)
                        loop_values = list(base_values) + list(primary_key_values)
                        loop_base_sql = '%s AND %s %s' % (
                            base_sql,
                            primary_key_sql,
                            order_sql,
                        )
                    else:
                        loop_base_sql = '%s %s' % (
                            base_sql,
                            order_sql,
                        )
                        loop_values = base_values
            elif self.c.source_type == 'crate':
                limit *= 2
                loop_base_sql = '%s LIMIT %u' % (base_sql, limit)
                loop_values = base_values
            else:
                loop_base_sql = base_sql
                loop_values = base_values

            sql = (
                'SELECT %s %s'
            ) % (
                ', '.join(
                    source_conn.column_query_sql(col)
                    for col in self.c.export_columns
                ),
                loop_base_sql,
            )
            if not use_order:
                self.chunk.num_records_exported = 0
                self.chunk.update()

            with db.cursor(source_conn) as source_cur:
                source_cur.execute(sql % {'?': source_conn.PARAMETER_PLACEHOLDER}, loop_values)
                num_recs = 0
                while True:
                    records = source_cur.fetchmany(config.PIPE_BULK_INSERT_SIZE)
                    if not records:
                        break
                    self.chunk.num_records_exported += len(records)
                    self.chunk.update()
                    num_recs += len(records)
                    if use_order and not use_offset:
                        new_key_max_values = []
                        for col_idx_idx in xrange(len(key_indexes)):
                            max_val = max(r[key_indexes[col_idx_idx]] for r in records)
                            if key_max_values:
                                max_val = max(max_val, key_max_values[col_idx_idx])
                            new_key_max_values.append(max_val)
                        key_max_values = new_key_max_values
                    self.log_debug('Got records from source num_records=%s', len(records))
                    self.upsert(dest_conn, records)

            if self.c.source_type == 'crate':
                if use_order:
                    if num_recs < limit:
                        self.log('Chunk finished num_recs=%u limit=%u', num_recs, limit)
                        break
                    else:
                        self.log(
                            'Chunk has more records key_max_values=%r num_recs=%u limit=%u',
                            key_max_values, num_recs, limit
                        )
                # if we got as many records as the limit set above it is likely there were more than that to get
                # so we need to loop and do it over with a higher limit
                elif self.chunk.num_records_exported < limit:
                    break
                else:
                    self.log(
                        'The number of records has grown more than double, retrying with double the limit limit=%u',
                        limit
                    )
            else:
                break
        return True