Example #1
0
 def __init__(self, config):
     """Wire up the PostgreSQL, HA and REST API helpers described by ``config``.

     Reads ``config['loop_wait']``, ``config['postgresql']`` and
     ``config['restapi']``; ``config['skydns2']`` is optional and stored
     as ``None`` when it is missing.
     """
     self.nap_time = config['loop_wait']
     self.postgresql = Postgresql(config['postgresql'])
     dcs = self.get_dcs(self.postgresql.name, config)
     self.ha = Ha(self.postgresql, dcs)
     restapi_config = config['restapi']
     # host/port are not used afterwards; the unpacking still raises
     # ValueError unless 'listen' contains exactly one ':'.
     host, port = restapi_config['listen'].split(':')
     self.api = RestApiServer(self, restapi_config)
     self.skydns2 = config.get('skydns2')
     self.next_run = time.time()
     self.shutdown_member_ttl = 300
Example #2
0
 def set_up(self):
     """Create the ``Postgresql`` instance and leader ``Member`` used by the tests.

     Rebinds ``os.system`` and ``psycopg2.connect`` to the stand-ins
     ``os_system`` and ``psycopg2_connect``, and creates the data directory
     when it does not exist yet.
     """
     os.system = os_system
     replication = {'username': '******', 'password': '******', 'network': '127.0.0.1/32'}
     settings = {
         'name': 'test0',
         'data_dir': 'data/test0',
         'listen': '127.0.0.1, 127.0.0.2:5432',
         'connect_address': '127.0.0.2:5432',
         'replication': replication,
         'parameters': {'foo': 'bar'},
         'recovery_conf': {'foo': 'bar'},
     }
     self.p = Postgresql(settings)
     psycopg2.connect = psycopg2_connect
     if not os.path.exists(self.p.data_dir):
         os.makedirs(self.p.data_dir)
     self.leader = Member('leader', 'postgres://*****:*****@127.0.0.1:5434/postgres', 28)
Example #3
0
    def __init__(self, config):
        """Assemble the etcd, AWS, PostgreSQL, HA and REST API helpers.

        Asserts that ``config["etcd"]["ttl"]`` is more than twice
        ``config["loop_wait"]`` before anything is constructed.
        """
        etcd_config = config["etcd"]
        assert etcd_config["ttl"] > 2 * config["loop_wait"]

        self.nap_time = config['loop_wait']
        self.etcd = Etcd(etcd_config)
        self.aws = AWSConnection(config)
        pg_config = config['postgresql']
        self.postgresql = Postgresql(pg_config, self.aws.on_role_change)
        self.ha = Ha(self.postgresql, self.etcd)
        restapi_config = config['restapi']
        # host/port are not used afterwards; the unpacking still raises
        # ValueError unless 'listen' contains exactly one ':'.
        host, port = restapi_config['listen'].split(':')
        self.api = RestApiServer(self, restapi_config)
        self.next_run = time.time()
Example #4
0
def run(config):
    """Serve the HAProxy status endpoint until interrupted.

    Builds the ``Etcd`` and ``Postgresql`` helpers from ``config["etcd"]``
    and ``config["postgresql"]``, binds an HTTP server to the ``host:port``
    given in ``config["haproxy_status"]["listen"]`` and blocks in
    ``serve_forever()``. On Ctrl-C the listening socket is closed if the
    server had already been created.
    """
    etcd = Etcd(config["etcd"])
    postgresql = Postgresql(config["postgresql"])
    # Stays None until the server is bound, so an interrupt that arrives
    # during setup does not hit an unbound name in the handler below.
    server = None
    try:
        from BaseHTTPServer import HTTPServer
        host, port = config["haproxy_status"]["listen"].split(":")
        server = HTTPServer((host, int(port)), handler(postgresql, etcd))
        logging.info('listening on %s:%s', host, port)
        server.serve_forever()

    except KeyboardInterrupt:
        print('^C received, shutting down server')
        if server is not None:
            server.socket.close()
Example #5
0
 def set_up(self):
     """Create the ``Postgresql`` under test and three cluster ``Member`` objects.

     Rebinds ``subprocess.call``, ``shutil.copy`` and ``psycopg2.connect``
     to the stand-ins ``subprocess_call``, ``nop`` and ``psycopg2_connect``,
     and creates the data directory when it does not exist yet.
     """
     subprocess.call = subprocess_call
     shutil.copy = nop
     pg_hba = ['hostssl all all 0.0.0.0/0 md5', 'host all all 0.0.0.0/0 md5']
     replication = {'username': '******', 'password': '******', 'network': '127.0.0.1/32'}
     callbacks = {
         'on_start': 'true',
         'on_stop': 'true',
         'on_restart': 'true',
         'on_role_change': 'true',
         'on_reload': 'true',
     }
     settings = {
         'name': 'test0',
         'scope': 'batman',
         'data_dir': 'data/test0',
         'listen': '127.0.0.1, *:5432',
         'connect_address': '127.0.0.2:5432',
         'pg_hba': pg_hba,
         'superuser': {'password': ''},
         'admin': {'username': '******', 'password': '******'},
         'replication': replication,
         'parameters': {'foo': 'bar'},
         'recovery_conf': {'foo': 'bar'},
         'callbacks': callbacks,
         'restore': 'true',
     }
     self.p = Postgresql(settings)
     psycopg2.connect = psycopg2_connect
     if not os.path.exists(self.p.data_dir):
         os.makedirs(self.p.data_dir)
     self.leader = Member(0, 'leader', 'postgres://*****:*****@127.0.0.1:5435/postgres', None, None, 28)
     self.other = Member(0, 'test1', 'postgres://*****:*****@127.0.0.1:5433/postgres', None, None, 28)
     self.me = Member(0, 'test0', 'postgres://*****:*****@127.0.0.1:5434/postgres', None, None, 28)
Example #6
0
 def set_up(self):
     """Create the ``Postgresql`` under test and the leader ``Member``.

     Rebinds ``subprocess.call``, ``shutil.copy`` and ``psycopg2.connect``
     to the stand-ins ``subprocess_call``, ``nop`` and ``psycopg2_connect``,
     and creates the data directory when it does not exist yet.
     """
     subprocess.call = subprocess_call
     shutil.copy = nop
     pg_hba = ['hostssl all all 0.0.0.0/0 md5', 'host all all 0.0.0.0/0 md5']
     replication = {'username': '******', 'password': '******', 'network': '127.0.0.1/32'}
     settings = {
         'name': 'test0',
         'data_dir': 'data/test0',
         'listen': '127.0.0.1, *:5432',
         'connect_address': '127.0.0.2:5432',
         'pg_hba': pg_hba,
         'superuser': {'password': ''},
         'admin': {'username': '******', 'password': '******'},
         'replication': replication,
         'parameters': {'foo': 'bar'},
         'recovery_conf': {'foo': 'bar'},
     }
     # The change callback accepts any state and always returns True.
     self.p = Postgresql(settings, on_change_callback=lambda state: True)
     psycopg2.connect = psycopg2_connect
     if not os.path.exists(self.p.data_dir):
         os.makedirs(self.p.data_dir)
     self.leader = Member('leader', 'postgres://*****:*****@127.0.0.1:5434/postgres', None, 28)
Example #7
0
class Patroni:
    """Glue between a local PostgreSQL node, the HA logic and the DCS.

    The distributed configuration store (DCS) is etcd or ZooKeeper,
    whichever section is present in the configuration (see ``get_dcs``).
    """

    def __init__(self, config):
        """Build the PostgreSQL, HA and REST API helpers from ``config``.

        Reads ``config['loop_wait']``, ``config['postgresql']`` and
        ``config['restapi']``; ``config['skydns2']`` is optional and stored
        as ``None`` when it is missing.
        """
        self.nap_time = config['loop_wait']
        self.postgresql = Postgresql(config['postgresql'])
        self.ha = Ha(self.postgresql, self.get_dcs(self.postgresql.name, config))
        # host/port are not used afterwards; the unpacking still raises
        # ValueError unless 'listen' contains exactly one ':'.
        host, port = config['restapi']['listen'].split(':')
        self.api = RestApiServer(self, config['restapi'])
        self.skydns2 = config.get('skydns2')
        self.next_run = time.time()
        # touch_member() skips the DCS update while this node's member entry
        # reports a real_ttl() above this value.
        self.shutdown_member_ttl = 300

    @staticmethod
    def get_dcs(name, config):
        """Return the DCS client selected by ``config``.

        An 'etcd' section takes precedence over a 'zookeeper' section. For
        etcd the TTL must be more than twice ``config['loop_wait']``
        (checked with ``assert``).

        Raises:
            Exception: when neither section is present.
        """
        if 'etcd' in config:
            assert config['etcd']['ttl'] > 2 * config['loop_wait']

            return Etcd(name, config['etcd'])
        if 'zookeeper' in config:
            return ZooKeeper(name, config['zookeeper'])
        raise Exception('Can not find sutable configuration of distributed configuration store')

    def touch_member(self, ttl=None):
        """Refresh this node's member entry in the DCS.

        Returns ``True`` without contacting the DCS when the cached cluster
        view already lists this node with a ``real_ttl()`` above
        ``shutdown_member_ttl``; otherwise returns the result of
        ``dcs.touch_member``.
        """
        connection_string = self.postgresql.connection_string + '?application_name=' + self.api.connection_string
        if self.ha.cluster:
            for m in self.ha.cluster.members:
                # Do not update member TTL when it is far from being expired
                if m.name == self.postgresql.name and m.real_ttl() > self.shutdown_member_ttl:
                    return True
        return self.ha.dcs.touch_member(connection_string, ttl)

    def initialize(self):
        """Wait for the DCS, then bring up PostgreSQL if the data directory is empty.

        With an empty data directory the node either wins the '/initialize'
        race (initializes, takes the leader key, starts) or loops until it
        has synced from the current leader. With existing data and a running
        server it only loads the replication slots.
        """
        # wait for the DCS to be available
        while not self.touch_member():
            logger.info('waiting on DCS')
            sleep(5)

        # is data directory empty?
        if self.postgresql.data_directory_empty():
            # racing to initialize
            if self.ha.dcs.race('/initialize'):
                self.postgresql.initialize()
                self.ha.dcs.take_leader()
                self.postgresql.start()
            else:
                # Lost the race: retry every 5 seconds until a leader exists
                # and syncing from it succeeds.
                while True:
                    leader = self.ha.dcs.current_leader()
                    if leader and self.postgresql.sync_from_leader(leader):
                        self.postgresql.write_recovery_conf(leader)
                        self.postgresql.start()
                        break
                    sleep(5)
        elif self.postgresql.is_running():
            self.postgresql.load_replication_slots()

    def schedule_next_run(self):
        """Advance ``next_run`` by ``nap_time`` and sleep until then.

        When the new deadline has already passed, ``next_run`` is reset to
        the current time and no sleep happens.
        """
        self.next_run += self.nap_time
        current_time = time.time()
        nap_time = self.next_run - current_time
        if nap_time <= 0:
            self.next_run = current_time
        else:
            self.ha.dcs.sleep(nap_time)

    def run(self):
        """Start the REST API and run the HA cycle forever.

        Each cycle refreshes the member entry, runs one HA cycle (logging
        its result), maintains replication slots, reaps child processes and
        sleeps until the next scheduled run. Errors raised while maintaining
        slots are logged and the loop continues.
        """
        self.api.start()
        self.next_run = time.time()

        while True:
            self.touch_member()
            logger.info(self.ha.run_cycle())
            try:
                if self.ha.state_handler.is_leader():
                    if self.ha.cluster:
                        self.ha.state_handler.create_replication_slots(self.ha.cluster)

                    # SkyDNS2 support: publish leader
                    if self.skydns2:
                        self.ha.dcs.client.set(
                            self.skydns2['publish_leader'],
                            '{{"host": "{0}", "port": {1}}}'.format(*self.postgresql.connect_address),
                            ttl=self.skydns2['ttl'])
                else:
                    self.ha.state_handler.drop_replication_slots()
            except Exception:
                # Exception rather than a bare except, so KeyboardInterrupt
                # and SystemExit can still stop the loop.
                logger.exception('Exception when changing replication slots')
            reap_children()
            self.schedule_next_run()
Example #8
0
 def test_create_connection_users(self):
     # Reuse the fixture's config with a superuser name set, then check that
     # a fresh Postgresql built from it can create its connection users.
     settings = self.p.config
     settings['superuser']['username'] = '******'
     Postgresql(settings).create_connection_users()
Example #9
0
class TestPostgresql(unittest.TestCase):
    """Tests for ``Postgresql`` that run against patched system and database calls."""

    def __init__(self, method_name='runTest'):
        # Expose the snake_case fixture methods under the names unittest calls.
        self.setUp = self.set_up
        self.tearDown = self.tear_down
        super(TestPostgresql, self).__init__(method_name)

    def set_up(self):
        """Create the ``Postgresql`` under test and the leader ``Member``.

        Rebinds ``subprocess.call``, ``shutil.copy`` and ``psycopg2.connect``
        to the stand-ins ``subprocess_call``, ``nop`` and ``psycopg2_connect``,
        and creates the data directory when it does not exist yet.
        """
        subprocess.call = subprocess_call
        shutil.copy = nop
        self.p = Postgresql({'name': 'test0', 'data_dir': 'data/test0', 'listen': '127.0.0.1, *:5432',
                             'connect_address': '127.0.0.2:5432',
                             'pg_hba': ['hostssl all all 0.0.0.0/0 md5', 'host all all 0.0.0.0/0 md5'],
                             'superuser': {'password': ''}, 'admin': {'username': '******', 'password': '******'},
                             'replication': {'username': '******',
                                             'password': '******',
                                             'network': '127.0.0.1/32'},
                             'parameters': {'foo': 'bar'}, 'recovery_conf': {'foo': 'bar'}},
                             on_change_callback=lambda state: True)
        psycopg2.connect = psycopg2_connect
        if not os.path.exists(self.p.data_dir):
            os.makedirs(self.p.data_dir)
        self.leader = Member('leader', 'postgres://*****:*****@127.0.0.1:5434/postgres', None, 28)

    def tear_down(self):
        """Remove the 'data' directory tree created by ``set_up``."""
        shutil.rmtree('data')

    def test_data_directory_empty(self):
        self.assertTrue(self.p.data_directory_empty())

    def test_initialize(self):
        self.assertTrue(self.p.initialize())
        self.assertTrue(os.path.exists(os.path.join(self.p.data_dir, 'pg_hba.conf')))

    def test_start(self):
        # The first start is expected to report failure; after swapping in
        # the is_running stand-in and creating an empty postmaster.pid it
        # is expected to succeed.
        self.assertFalse(self.p.start())
        self.p.is_running = is_running
        with open(os.path.join(self.p.data_dir, 'postmaster.pid'), 'w'):
            pass
        self.assertTrue(self.p.start())

    def test_sync_from_leader(self):
        self.assertTrue(self.p.sync_from_leader(self.leader))

    def test_follow_the_leader(self):
        # Smoke test: follow nobody, the fixture leader, then another leader.
        self.p.demote(self.leader)
        self.p.follow_the_leader(None)
        self.p.demote(self.leader)
        self.p.follow_the_leader(self.leader)
        self.p.follow_the_leader(Member('leader', 'postgres://*****:*****@127.0.0.1:5435/postgres', None, 28))

    def test_create_replication_slots(self):
        self.p.start()
        me = Member('test0', 'postgres://*****:*****@127.0.0.1:5434/postgres', None, 28)
        other = Member('test1', 'postgres://*****:*****@127.0.0.1:5433/postgres', None, 28)
        cluster = Cluster(True, self.leader, 0, [me, other, self.leader])
        self.p.create_replication_slots(cluster)

    def test_query(self):
        self.p.query('select 1')
        self.assertRaises(psycopg2.InterfaceError, self.p.query, 'InterfaceError')
        self.assertRaises(psycopg2.OperationalError, self.p.query, 'blabla')
        # Mark the connection as closed and query again.
        self.p._connection.closed = 2
        self.assertRaises(psycopg2.OperationalError, self.p.query, 'blabla')
        # Same again, with disconnect replaced by the `false` stand-in.
        self.p._connection.closed = 2
        self.p.disconnect = false
        self.assertRaises(psycopg2.OperationalError, self.p.query, 'blabla')

    def test_is_healthiest_node(self):
        leader = Member('leader', 'postgres://*****:*****@127.0.0.1:5435/postgres', None, 28)
        me = Member('test0', 'postgres://*****:*****@127.0.0.1:5434/postgres', None, 28)
        other = Member('test1', 'postgres://*****:*****@127.0.0.1:5433/postgres', None, 28)
        cluster = Cluster(True, leader, 0, [me, other, leader])
        self.assertTrue(self.p.is_healthiest_node(cluster))
        self.p.is_leader = false
        self.assertFalse(self.p.is_healthiest_node(cluster))
        self.p.xlog_position = xlog_position
        self.assertTrue(self.p.is_healthiest_node(cluster))
        self.p.config['maximum_lag_on_failover'] = -2
        self.assertFalse(self.p.is_healthiest_node(cluster))

    def test_is_leader(self):
        # is_leader() is expected to clear the is_promoted flag.
        self.p.is_promoted = True
        self.assertTrue(self.p.is_leader())
        self.assertFalse(self.p.is_promoted)

    def test_reload(self):
        self.assertTrue(self.p.reload())

    def test_is_healthy(self):
        self.assertTrue(self.p.is_healthy())
        self.p.is_running = is_running
        self.assertFalse(self.p.is_healthy())

    def test_promote(self):
        self.assertTrue(self.p.promote())

    def test_last_operation(self):
        # assertEqual: the assertEquals alias is deprecated and removed in
        # Python 3.12.
        self.assertEqual(self.p.last_operation(), 0)
Example #10
0
def run(config):
    """Bootstrap this Governor node, then run the HA loop forever.

    Startup: with an empty data directory the node either wins the etcd
    "/initialize" race (initializes, takes the leader key, starts) or keeps
    retrying until it has synced from the current leader; with an existing
    data directory it starts with no leader to follow. Afterwards each cycle
    logs ``ha.run_cycle()``, creates missing physical replication slots while
    this node is the leader, refreshes the member entry in etcd and sleeps
    ``config["loop_wait"]`` seconds. A ``urllib2.URLError`` puts the node
    into no-leader mode until etcd answers again. Does not return.
    """
    readonly_message = "running in readonly mode; cannot participate in cluster HA without etcd"
    # Creates the physical slot named %(slot)s unless it already exists.
    slot_sql = "DO LANGUAGE plpgsql $$DECLARE somevar VARCHAR; BEGIN SELECT slot_name INTO somevar FROM pg_replication_slots WHERE slot_name = '%(slot)s' LIMIT 1; IF NOT FOUND THEN PERFORM pg_create_physical_replication_slot('%(slot)s'); END IF; END$$;"

    etcd = Etcd(config["etcd"])
    postgresql = Postgresql(config["postgresql"])
    ha = Ha(postgresql, etcd)

    atexit.register(stop_postgresql, postgresql)
    logging.info("Governor Starting up")

    if not postgresql.data_directory_empty():
        logging.info("Governor Starting up: Existing Data Dir")
        postgresql.follow_no_leader()
        logging.info("Governor Starting up: Starting Postgres")
        postgresql.start()
    else:
        logging.info("Governor Starting up: Empty Data Dir")
        # racing to initialize
        wait_for_etcd("cannot initialize member without ETCD", etcd, postgresql)
        if etcd.race("/initialize", postgresql.name):
            logging.info("Governor Starting up: Initialisation Race ... WON!!!")
            logging.info("Governor Starting up: Initialise Postgres")
            postgresql.initialize()
            logging.info("Governor Starting up: Initialise Complete")
            etcd.take_leader(postgresql.name)
            logging.info("Governor Starting up: Starting Postgres")
            postgresql.start()
        else:
            logging.info("Governor Starting up: Initialisation Race ... LOST")
            logging.info("Governor Starting up: Sync Postgres from Leader")
            # Retry every 5 seconds until a leader exists and the sync works.
            while True:
                leader = etcd.current_leader()
                if leader and postgresql.sync_from_leader(leader):
                    logging.info("Governor Starting up: Sync Completed")
                    postgresql.write_recovery_conf(leader)
                    logging.info("Governor Starting up: Starting Postgres")
                    postgresql.start()
                    break
                time.sleep(5)

    wait_for_etcd(readonly_message, etcd, postgresql)
    logging.info("Governor Running: Starting Running Loop")
    while True:
        try:
            logging.info("Governor Running: %s" % ha.run_cycle())

            # create replication slots
            if postgresql.is_leader():
                logging.info("Governor Running: I am the Leader")
                member_nodes = etcd.get_client_path("/members?recursive=true")["node"]["nodes"]
                for node in member_nodes:
                    # The member name is the last path segment of the key.
                    member = node["key"].split('/')[-1]
                    if member != postgresql.name:
                        postgresql.query(slot_sql % {"slot": member})
            etcd.touch_member(postgresql.name, postgresql.connection_string)

            time.sleep(config["loop_wait"])
        except urllib2.URLError:
            logging.info("Lost connection to etcd, setting no leader and waiting on etcd")
            postgresql.follow_no_leader()
            wait_for_etcd(readonly_message, etcd, postgresql)
Example #11
0
def run(config):
    """Bootstrap this Governor node, then run the HA loop forever.

    Startup: with an empty data directory the node initializes PostgreSQL
    when it wins the etcd "/initialize" race or when ``etcd.members()`` is
    empty; otherwise it keeps retrying until it has synced from the current
    leader. With an existing data directory it starts with no leader to
    follow. Afterwards each cycle runs ``ha.run_cycle()``, creates missing
    physical replication slots while this node is the leader, refreshes the
    member entry in etcd and sleeps ``config["loop_wait"]`` seconds. A
    ``urllib2.URLError`` puts the node into no-leader mode until etcd
    answers again. Does not return.
    """
    etcd = Etcd(config["etcd"])
    postgresql = Postgresql(config["postgresql"])
    ha = Ha(postgresql, etcd)

    atexit.register(stop_postgresql, postgresql)
    logging.info("Governor Starting up")
    # is data directory empty?
    if postgresql.data_directory_empty():
        logging.info("Governor Starting up: Empty Data Dir")
        # racing to initialize
        wait_for_etcd("cannot initialize member without ETCD", etcd, postgresql)
        won_race = etcd.race("/initialize", postgresql.name) or not etcd.members()
        if won_race:
            logging.info("Governor Starting up: Initialisation Race ... WON!!!")
            logging.info("Governor Starting up: Initialise Postgres")
            postgresql.initialize()
            logging.info("Governor Starting up: Initialise Complete")
            etcd.take_leader(postgresql.name)
            logging.info("Governor Starting up: Starting Postgres")
            postgresql.start()
        else:
            logging.info("Governor Starting up: Initialisation Race ... LOST")
            logging.info("Governor Starting up: Sync Postgres from Leader")
            # Retry every 5 seconds until a leader exists and the sync works.
            while True:
                leader = etcd.current_leader()
                if leader and postgresql.sync_from_leader(leader):
                    logging.info("Governor Starting up: Sync Completed")
                    postgresql.write_recovery_conf(leader)
                    logging.info("Governor Starting up: Starting Postgres")
                    postgresql.start()
                    break
                time.sleep(5)
    else:
        logging.info("Governor Starting up: Existing Data Dir")
        postgresql.follow_no_leader()
        logging.info("Governor Starting up: Starting Postgres")
        postgresql.start()

    readonly_message = "running in readonly mode; cannot participate in cluster HA without etcd"
    # Creates the physical slot named %(slot)s unless it already exists.
    slot_sql = "DO LANGUAGE plpgsql $$DECLARE somevar VARCHAR; BEGIN SELECT slot_name INTO somevar FROM pg_replication_slots WHERE slot_name = '%(slot)s' LIMIT 1; IF NOT FOUND THEN PERFORM pg_create_physical_replication_slot('%(slot)s'); END IF; END$$;"

    wait_for_etcd(readonly_message, etcd, postgresql)
    logging.info("Governor Running: Starting Running Loop")
    while True:
        try:
            ha.run_cycle()
            # create replication slots
            if postgresql.is_leader():
                logging.info("Governor Running: I am the Leader")
                listing = etcd.get_client_path("/members?recursive=true")
                # Missing "node"/"nodes" entries are treated as no members.
                for node in listing.get("node", {}).get("nodes", []):
                    member = node["key"].split('/')[-1]
                    if member != postgresql.name:
                        postgresql.query(slot_sql % {"slot": member})
            etcd.touch_member(postgresql.name, postgresql.connection_string)

            time.sleep(config["loop_wait"])
        except urllib2.URLError:
            logging.info("Lost connection to etcd, setting no leader and waiting on etcd")
            postgresql.follow_no_leader()
            wait_for_etcd(readonly_message, etcd, postgresql)
Example #12
0
class TestPostgresql(unittest.TestCase):
    """Tests for ``Postgresql`` that run against patched system and database calls."""

    def __init__(self, method_name='runTest'):
        # Expose the snake_case fixture methods under the names unittest calls.
        self.setUp = self.set_up
        self.tearDown = self.tear_down
        super(TestPostgresql, self).__init__(method_name)

    def set_up(self):
        """Create the ``Postgresql`` under test and three cluster ``Member`` objects.

        Rebinds ``subprocess.call``, ``shutil.copy`` and ``psycopg2.connect``
        to the stand-ins ``subprocess_call``, ``nop`` and ``psycopg2_connect``,
        and creates the data directory when it does not exist yet.
        """
        subprocess.call = subprocess_call
        shutil.copy = nop
        self.p = Postgresql({'name': 'test0', 'scope': 'batman', 'data_dir': 'data/test0',
                             'listen': '127.0.0.1, *:5432', 'connect_address': '127.0.0.2:5432',
                             'pg_hba': ['hostssl all all 0.0.0.0/0 md5', 'host all all 0.0.0.0/0 md5'],
                             'superuser': {'password': ''},
                             'admin': {'username': '******', 'password': '******'},
                             'replication': {'username': '******',
                                             'password': '******',
                                             'network': '127.0.0.1/32'},
                             'parameters': {'foo': 'bar'}, 'recovery_conf': {'foo': 'bar'},
                             'callbacks': {'on_start': 'true', 'on_stop': 'true',
                                           'on_restart': 'true', 'on_role_change': 'true',
                                           'on_reload': 'true'
                                           },
                             'restore': 'true'})
        psycopg2.connect = psycopg2_connect
        if not os.path.exists(self.p.data_dir):
            os.makedirs(self.p.data_dir)
        self.leader = Member(0, 'leader', 'postgres://*****:*****@127.0.0.1:5435/postgres', None, None, 28)
        self.other = Member(0, 'test1', 'postgres://*****:*****@127.0.0.1:5433/postgres', None, None, 28)
        self.me = Member(0, 'test0', 'postgres://*****:*****@127.0.0.1:5434/postgres', None, None, 28)

    def tear_down(self):
        """Remove the 'data' directory tree created by ``set_up``."""
        shutil.rmtree('data')

    def mock_query(self, p):
        """Stand-in for ``Postgresql.query`` that always raises ``OperationalError``."""
        raise psycopg2.OperationalError("not supported")

    def test_data_directory_empty(self):
        self.assertTrue(self.p.data_directory_empty())

    def test_initialize(self):
        self.assertTrue(self.p.initialize())
        self.assertTrue(os.path.exists(os.path.join(self.p.data_dir, 'pg_hba.conf')))

    def test_start_stop(self):
        # The first start is expected to report failure; after swapping in
        # the is_running stand-in and creating an empty postmaster.pid both
        # start and stop are expected to succeed.
        self.assertFalse(self.p.start())
        self.p.is_running = is_running
        with open(os.path.join(self.p.data_dir, 'postmaster.pid'), 'w'):
            pass
        self.assertTrue(self.p.start())
        self.assertTrue(self.p.stop())

    def test_sync_from_leader(self):
        self.assertTrue(self.p.sync_from_leader(self.leader))

    def test_follow_the_leader(self):
        # Smoke test: follow nobody, the fixture leader, then another member.
        self.p.demote(self.leader)
        self.p.follow_the_leader(None)
        self.p.demote(self.leader)
        self.p.follow_the_leader(self.leader)
        self.p.follow_the_leader(self.other)

    def test_create_connection_users(self):
        cfg = self.p.config
        cfg['superuser']['username'] = '******'
        p = Postgresql(cfg)
        p.create_connection_users()

    def test_create_replication_slots(self):
        self.p.start()
        cluster = Cluster(True, self.leader, 0, [self.me, self.other, self.leader])
        self.p.create_replication_slots(cluster)

    def test_query(self):
        self.p.query('select 1')
        self.assertRaises(psycopg2.InterfaceError, self.p.query, 'InterfaceError')
        self.assertRaises(psycopg2.OperationalError, self.p.query, 'blabla')
        # Mark the connection as closed and query again.
        self.p._connection.closed = 2
        self.assertRaises(psycopg2.OperationalError, self.p.query, 'blabla')
        # Same again, with disconnect replaced by the `false` stand-in.
        self.p._connection.closed = 2
        self.p.disconnect = false
        self.assertRaises(psycopg2.OperationalError, self.p.query, 'blabla')

    def test_is_healthiest_node(self):
        cluster = Cluster(True, self.leader, 0, [self.me, self.other, self.leader])
        self.assertTrue(self.p.is_healthiest_node(cluster))
        self.p.is_leader = false
        self.assertFalse(self.p.is_healthiest_node(cluster))
        self.p.xlog_position = lambda: 1
        self.assertTrue(self.p.is_healthiest_node(cluster))
        self.p.xlog_position = lambda: 2
        self.assertFalse(self.p.is_healthiest_node(cluster))
        self.p.config['maximum_lag_on_failover'] = -2
        self.assertFalse(self.p.is_healthiest_node(cluster))

    def test_is_leader(self):
        # is_leader() is expected to clear the is_promoted flag.
        self.p.is_promoted = True
        self.assertTrue(self.p.is_leader())
        self.assertFalse(self.p.is_promoted)

    def test_reload(self):
        self.assertTrue(self.p.reload())

    def test_is_healthy(self):
        self.assertTrue(self.p.is_healthy())
        self.p.is_running = is_running
        self.assertFalse(self.p.is_healthy())

    def test_promote(self):
        self.assertTrue(self.p.promote())

    def test_last_operation(self):
        # assertEqual: the assertEquals alias is deprecated and removed in
        # Python 3.12.
        self.assertEqual(self.p.last_operation(), '0')

    def test_non_existing_callback(self):
        self.assertFalse(self.p.call_nowait('foobar'))

    def test_is_leader_exception(self):
        # stop() is expected to succeed even when every query raises.
        self.p.start()
        self.p.query = self.mock_query
        self.assertTrue(self.p.stop())
Example #13
0
    config['postgresql']['listen'] = os.getenv('GOVERNOR_POSTGRESQL_LISTEN')

# Optional environment overrides: each variable below, when set to a
# non-empty value, replaces the matching entry under config['postgresql'].
if os.getenv('GOVERNOR_POSTGRESQL_READ_ONLY_PORT'):
    config['postgresql']['read_only_port'] = os.getenv(
        'GOVERNOR_POSTGRESQL_READ_ONLY_PORT')

if os.getenv('GOVERNOR_POSTGRESQL_DATA_DIR'):
    config['postgresql']['data_dir'] = os.getenv(
        'GOVERNOR_POSTGRESQL_DATA_DIR')

# This one lands one level deeper, under config['postgresql']['replication'].
if os.getenv('GOVERNOR_POSTGRESQL_REPLICATION_NETWORK'):
    config['postgresql']['replication']['network'] = os.getenv(
        'GOVERNOR_POSTGRESQL_REPLICATION_NETWORK')

# Module-level helpers built from the final configuration; the shutdown
# handler defined later in this module reads all three.
etcd = Etcd(config["etcd"])
postgresql = Postgresql(config["postgresql"])
ha = Ha(postgresql, etcd)


# leave things clean when shutting down, if possible
def shutdown(signal, frame):
    """Withdraw this node from etcd on a best-effort basis.

    Abdicates leadership when ``ha.has_lock()`` reports that this node holds
    the lock, then deletes the member entry. Failures are logged and not
    re-raised.

    Args:
        signal: not used by the body (presumably the signal number, given
            the handler-style signature; confirm against the registration).
        frame: not used by the body.
    """
    logging.info("Governor Shutting Down: Received Shutdown Signal")
    try:
        if ha.has_lock():
            logging.info("Governor Shutting Down: Abdicating Leadership")
            etcd.abdicate(postgresql.name)

        logging.info("Governor Shutting Down: Removing Membership")
        etcd.delete_member(postgresql.name)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed while shutting down.
        logging.exception("Error during Abdication")
Example #14
0
class Governor:
    """Ties a local PostgreSQL node to etcd, the HA logic and the REST API."""

    def __init__(self, config):
        """Assemble the etcd, AWS, PostgreSQL, HA and REST API helpers.

        Asserts that ``config["etcd"]["ttl"]`` is more than twice
        ``config["loop_wait"]`` before anything is constructed.
        """
        etcd_config = config["etcd"]
        assert etcd_config["ttl"] > 2 * config["loop_wait"]

        self.nap_time = config['loop_wait']
        self.etcd = Etcd(etcd_config)
        self.aws = AWSConnection(config)
        pg_config = config['postgresql']
        self.postgresql = Postgresql(pg_config, self.aws.on_role_change)
        self.ha = Ha(self.postgresql, self.etcd)
        restapi_config = config['restapi']
        # host/port are not used afterwards; the unpacking still raises
        # ValueError unless 'listen' contains exactly one ':'.
        host, port = restapi_config['listen'].split(':')
        self.api = RestApiServer(self, restapi_config)
        self.next_run = time.time()

    def touch_member(self, ttl=None):
        """Refresh this node's member entry in etcd and return etcd's result.

        The stored value is the PostgreSQL connection string with the REST
        API connection string appended as ``application_name``.
        """
        pg_url = self.postgresql.connection_string
        api_url = self.api.connection_string
        member_url = pg_url + '?application_name=' + api_url
        return self.etcd.touch_member(self.postgresql.name, member_url, ttl)

    def initialize(self):
        """Wait for etcd, then bring up PostgreSQL if the data directory is empty.

        With an empty data directory the node either wins the '/initialize'
        race (initializes, takes the leader key, starts) or loops until it
        has synced from the current leader. With existing data and a running
        server it only loads the replication slots.
        """
        # FIXME: isn't there a better way testing if etcd is writable?
        # Block until the member entry can be written.
        while not self.touch_member():
            logging.info('waiting on etcd')
            sleep(5)

        # Existing data: nothing to bootstrap.
        if not self.postgresql.data_directory_empty():
            if self.postgresql.is_running():
                self.postgresql.load_replication_slots()
            return

        # Empty data directory: race to initialize.
        if self.etcd.race('/initialize', self.postgresql.name):
            self.postgresql.initialize()
            self.etcd.take_leader(self.postgresql.name)
            self.postgresql.start()
            return

        # FIXME: touch_member?
        # Lost the race: retry every 5 seconds until syncing succeeds.
        while True:
            leader = self.etcd.current_leader()
            if leader and self.postgresql.sync_from_leader(leader):
                self.postgresql.write_recovery_conf(leader)
                self.postgresql.start()
                return
            sleep(5)

    def schedule_next_run(self):
        """Advance ``next_run`` by ``nap_time`` and sleep until then.

        When the new deadline has already passed, ``next_run`` is reset to
        the current time and no sleep happens.
        """
        self.next_run += self.nap_time
        now = time.time()
        remaining = self.next_run - now
        if remaining <= 0:
            self.next_run = now
            return
        sleep(remaining)

    def run(self):
        """Start the REST API, then refresh membership and run HA cycles forever."""
        self.api.start()
        self.next_run = time.time()

        while True:
            self.touch_member()
            cycle_result = self.ha.run_cycle()
            logging.info(cycle_result)

            self.schedule_next_run()
Example #15
0
#!/usr/bin/env python

from BaseHTTPServer import BaseHTTPRequestHandler
from helpers.etcd import Etcd
from helpers.postgresql import Postgresql
import sys, yaml, socket

# Load the YAML configuration named by the first command-line argument.
# The with-block closes the file even when parsing raises.
# NOTE(review): yaml.load without an explicit Loader can construct arbitrary
# Python objects; that is acceptable only for trusted files, otherwise
# prefer yaml.safe_load.
with open(sys.argv[1], "r") as f:
    config = yaml.load(f.read())

# Module-level helpers read by StatusHandler below.
etcd = Etcd(config["etcd"])
postgresql = Postgresql(config["postgresql"])


class StatusHandler(BaseHTTPRequestHandler):
    """HTTP status endpoint: 200 when this node is the etcd leader, else 503.

    GET and OPTIONS are answered identically. The handler reads the
    module-level ``postgresql`` and ``etcd`` objects.
    """

    def do_GET(self):
        return self.do_ANY()

    def do_OPTIONS(self):
        return self.do_ANY()

    def do_ANY(self):
        """Send 200 if the current leader's hostname is this node's name, else 503."""
        leader = etcd.current_leader()
        # A falsy result (no leader known) is answered with 503 instead of
        # failing on the ["hostname"] lookup.
        if leader and postgresql.name == leader["hostname"]:
            self.send_response(200)
        else:
            self.send_response(503)
        self.end_headers()
        self.wfile.write('\r\n')
        return
Example #16
0
    config['postgresql']['listen'] = os.getenv('GOVERNOR_POSTGRESQL_LISTEN')

# Optional environment overrides: each variable below, when set to a
# non-empty value, replaces the matching entry under config['postgresql'].
if os.getenv('GOVERNOR_POSTGRESQL_READ_ONLY_PORT'):
    config['postgresql']['read_only_port'] = os.getenv(
        'GOVERNOR_POSTGRESQL_READ_ONLY_PORT')

if os.getenv('GOVERNOR_POSTGRESQL_DATA_DIR'):
    config['postgresql']['data_dir'] = os.getenv(
        'GOVERNOR_POSTGRESQL_DATA_DIR')

# This one lands one level deeper, under config['postgresql']['replication'].
if os.getenv('GOVERNOR_POSTGRESQL_REPLICATION_NETWORK'):
    config['postgresql']['replication']['network'] = os.getenv(
        'GOVERNOR_POSTGRESQL_REPLICATION_NETWORK')

# Module-level helpers built from the final configuration; the shutdown
# handler defined later in this module reads all three.
etcd = Etcd(config["etcd"])
postgresql = Postgresql(config["postgresql"])
ha = Ha(postgresql, etcd)


# leave things clean when shutting down, if possible
def shutdown(signal, frame):
    """Withdraw this node from etcd on a best-effort basis.

    Abdicates leadership when ``ha.has_lock()`` reports that this node holds
    the lock, then deletes the member entry. Failures are logged and not
    re-raised.

    Args:
        signal: not used by the body (presumably the signal number, given
            the handler-style signature; confirm against the registration).
        frame: not used by the body.
    """
    logging.info("Governor Shutting Down: Received Shutdown Signal")
    try:
        if ha.has_lock():
            logging.info("Governor Shutting Down: Abdicating Leadership")
            etcd.abdicate(postgresql.name)

        logging.info("Governor Shutting Down: Removing Membership")
        etcd.delete_member(postgresql.name)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed while shutting down.
        logging.exception("Error during Abdication")
Example #17
0
#!/usr/bin/env python

import sys, os, yaml, time, urllib2, atexit, syslog
from socket import gethostname
from helpers.postgresql import Postgresql
from helpers.kms import Kms
from helpers.ec2 import Ec2

# Append the PostgreSQL 9.4 binary directory to PATH for this process.
psql_bin_path = "/usr/pgsql-9.4/bin"
os.environ['PATH'] += os.pathsep + psql_bin_path

# Read the config named by the first command-line argument. The with-block
# closes the file even when parsing raises.
# NOTE(review): yaml.load without an explicit Loader can construct arbitrary
# Python objects; that is acceptable only for trusted files, otherwise
# prefer yaml.safe_load.
with open(sys.argv[1], "r") as f:
    config = yaml.load(f.read())

# kms is needed to decrypt config
kms = Kms(config["kms"])

# Configure postgres: the node name is the short hostname (text before the
# first '.'), and the listen address is the IP reported by Ec2 plus the
# configured port.
ec2 = Ec2()
our_ip = ec2.ec2_ip()
hostname = gethostname()
config["postgresql"]["name"] = hostname.split('.')[0]
config["postgresql"]["listen"] = our_ip + ":" + str(config["postgresql"]["port"])
postgresql = Postgresql(config["postgresql"], kms, hostname)

# start postgres
postgresql.start()
Example #18
0
def run(config):
    """Bootstrap this Governor node, then run the HA loop forever.

    Startup: registers ``stop_postgresql`` for interpreter exit and
    ``signalhandler`` for SIGTERM. With an empty data directory the node
    either wins the etcd "/initialize" race (initializes, takes the leader
    key, starts) or keeps retrying until it has synced from the current
    leader; with an existing data directory it starts with no leader to
    follow. Afterwards each cycle logs ``ha.run_cycle()``, ensures (as
    leader) or drops (otherwise) a replication slot for every other member,
    refreshes the member entry in etcd and sleeps ``config["loop_wait"]``
    seconds. A ``urllib2.URLError`` or ``socket.timeout`` puts the node into
    no-leader mode until etcd answers again. Does not return.
    """
    readonly_message = "running in readonly mode; cannot participate in cluster HA without etcd"

    etcd = Etcd(config["etcd"])
    postgresql = Postgresql(config["postgresql"])
    ha = Ha(postgresql, etcd)

    atexit.register(stop_postgresql, postgresql)
    signal.signal(signal.SIGTERM, signalhandler)
    logging.info("Governor Starting up")

    if not postgresql.data_directory_empty():
        logging.info("Governor Starting up: Existing Data Dir")
        postgresql.follow_no_leader()
        logging.info("Governor Starting up: Starting Postgres")
        postgresql.start()
    else:
        logging.info("Governor Starting up: Empty Data Dir")
        # racing to initialize
        wait_for_etcd("cannot initialize member without ETCD", etcd, postgresql)
        if etcd.race("/initialize", postgresql.name):
            logging.info("Governor Starting up: Initialisation Race ... WON!!!")
            logging.info("Governor Starting up: Initialise Postgres")
            postgresql.initialize()
            logging.info("Governor Starting up: Initialise Complete")
            etcd.take_leader(postgresql.name)
            logging.info("Governor Starting up: Starting Postgres")
            postgresql.start()
        else:
            logging.info("Governor Starting up: Initialisation Race ... LOST")
            logging.info("Governor Starting up: Sync Postgres from Leader")
            # Retry every 5 seconds until a leader exists and the sync works.
            while True:
                leader = etcd.current_leader()
                if leader and postgresql.sync_from_leader(leader):
                    logging.info("Governor Starting up: Sync Completed")
                    postgresql.write_recovery_conf(leader)
                    logging.info("Governor Starting up: Starting Postgres")
                    postgresql.start()
                    break
                time.sleep(5)

    wait_for_etcd(readonly_message, etcd, postgresql)
    logging.info("Governor Running: Starting Running Loop")
    while True:
        try:
            logging.info("Governor Running: %s" % ha.run_cycle())

            if postgresql.is_leader():
                logging.info("Governor Running: I am the Leader")
            # Slot maintenance: skip our own entry, then ensure the slot
            # when leading and drop it otherwise.
            for node in etcd.members():
                member = node["hostname"]
                if member == postgresql.name:
                    continue
                if postgresql.is_leader():
                    slot_action = postgresql.ensure_replication_slot
                else:
                    slot_action = postgresql.drop_replication_slot
                slot_action(postgresql.replication_slot_name(member))
            etcd.touch_member(postgresql.name, postgresql.connection_string)

            time.sleep(config["loop_wait"])
        except (urllib2.URLError, socket.timeout):
            logging.info("Lost connection to etcd, setting no leader and waiting on etcd")
            postgresql.follow_no_leader()
            wait_for_etcd(readonly_message, etcd, postgresql)
Example #19
0
import logging

from helpers.etcd import Etcd
from helpers.postgresql import Postgresql
from helpers.ha import Ha

# DEBUG logging when the DEBUG environment variable is set to a non-empty
# value, INFO otherwise.
LOG_LEVEL = logging.DEBUG if os.getenv('DEBUG', None) else logging.INFO

logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=LOG_LEVEL)

# Load the YAML configuration named by the first command-line argument.
# The with-block closes the file even when parsing raises.
# NOTE(review): yaml.load without an explicit Loader can construct arbitrary
# Python objects; that is acceptable only for trusted files, otherwise
# prefer yaml.safe_load.
with open(sys.argv[1], "r") as f:
    config = yaml.load(f.read())

# Module-level helpers shared by the functions defined below.
etcd = Etcd(config["etcd"])
postgresql = Postgresql(config["postgresql"])
ha = Ha(postgresql, etcd)

# stop postgresql on script exit
def stop_postgresql():
    """Call ``stop()`` on the module-level ``postgresql`` object.

    The global is looked up when the function runs, not when it is
    registered with ``atexit`` on the line after this definition.
    """
    postgresql.stop()
atexit.register(stop_postgresql)

# wait for etcd to be available
def wait_for_etcd(message):
    etcd_ready = False
    while not etcd_ready:
        try:
            etcd.touch_member(postgresql.name, postgresql.connection_string)
            etcd_ready = True
        except urllib2.URLError:
Example #20
0
import sys, os, yaml, time, urllib2, atexit
import logging

from helpers.etcd import Etcd
from helpers.postgresql import Postgresql
from helpers.ha import Ha

logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                    level=logging.INFO)

# Load the YAML configuration named by the first command-line argument.
# The with-block closes the file even when parsing raises.
# NOTE(review): yaml.load without an explicit Loader can construct arbitrary
# Python objects; that is acceptable only for trusted files, otherwise
# prefer yaml.safe_load.
with open(sys.argv[1], "r") as f:
    config = yaml.load(f.read())

# Module-level helpers shared by the code below.
etcd = Etcd(config["etcd"])
postgresql = Postgresql(config["postgresql"])
ha = Ha(postgresql, etcd)


# stop postgresql on script exit
def stop_postgresql():
    """Call ``stop()`` on the module-level ``postgresql`` object.

    The global is looked up when the function runs, not when it is
    registered with ``atexit`` after this definition.
    """
    postgresql.stop()


atexit.register(stop_postgresql)

# wait for etcd to be available
etcd_ready = False
while not etcd_ready:
    try:
        etcd.touch_member(postgresql.name, postgresql.connection_string)