Example #1
0
def healthcheck():
    """Return this node's health summary, recording a heartbeat on server POSTs.

    When the request is a ``POST`` and ``g.source`` is a ``Server``, the JSON
    body must contain ``heartbeat`` (a string in ``defaults.DATETIME_FORMAT``)
    and ``me``; the pair is stored through ``cluster_manager.put``.

    Returns:
        dict with the keys ``version``, ``catalog_version``, ``services``,
        ``server``, ``neighbours``, ``cluster`` and ``now``. When
        ``check_param_in_uri('human')`` is falsy, servers are reported as
        ``{'id', 'name'}`` dicts and the cluster lists hold whatever the
        cluster manager returns; otherwise only sorted names are reported.

    Raises:
        errors.InvalidDateFormat: the posted ``heartbeat`` cannot be parsed
            with ``defaults.DATETIME_FORMAT``.
    """
    if request.method == 'POST' and isinstance(g.source, Server):
        body = request.get_json()
        # Read the raw value once, outside the ``try``: a missing key is a
        # malformed request, not a date-format problem, and must not be
        # re-raised from inside the error handler.
        raw_heartbeat = body['heartbeat']
        try:
            heartbeat = dt.datetime.strptime(raw_heartbeat,
                                             defaults.DATETIME_FORMAT)
        except (TypeError, ValueError) as exc:
            # Only parsing failures are translated; anything else propagates.
            raise errors.InvalidDateFormat(
                raw_heartbeat, defaults.DATETIME_FORMAT) from exc
        current_app.dm.cluster_manager.put(body['me'], heartbeat)

    catalog_ver = Catalog.max_catalog()
    data = {
        "version": dimensigon.__version__,
        "catalog_version": (catalog_ver.strftime(defaults.DATEMARK_FORMAT)
                            if catalog_ver else None),
        "services": [],
    }

    if not check_param_in_uri('human'):
        # Detailed form: identifiers together with names.
        server = {'id': str(g.server.id), 'name': g.server.name}
        neighbours = [{'id': str(s.id), 'name': s.name}
                      for s in Server.get_neighbours()]
        cluster = {
            'alive': current_app.dm.cluster_manager.get_alive(),
            'in_coma': current_app.dm.cluster_manager.get_zombies(),
        }
    else:
        # Name-only form; ids without a matching Server row are shown as-is.
        server = g.server.name
        neighbours = sorted([s.name for s in Server.get_neighbours()])
        cluster = {
            'alive': sorted([
                getattr(Server.query.get(i), 'name', i)
                for i in current_app.dm.cluster_manager.get_alive()
            ]),
            'in_coma': sorted([
                getattr(Server.query.get(i), 'name', i)
                for i in current_app.dm.cluster_manager.get_zombies()
            ]),
        }

    data.update(server=server,
                neighbours=neighbours,
                cluster=cluster,
                now=get_now().strftime(defaults.DATETIME_FORMAT))

    return data
Example #2
0
def cluster_in(server_id):
    """Register ``server_id`` as alive in the cluster and merge its routes.

    Only the ``root`` user may call this. The JSON body is expected to hold
    ``keepalive`` (a string in ``defaults.DATEMARK_FORMAT``) and ``routes``.

    Args:
        server_id: identifier of the server that is joining.

    Returns:
        A ``(body, 200)`` tuple whose body contains the current ``cluster``
        (as returned by ``cluster_manager.get_cluster``) and the ids of this
        node's ``neighbours``.

    Raises:
        errors.UserForbiddenError: the current user is missing or not root.
        errors.InvalidDateFormat: ``keepalive`` is absent, not a string, or
            does not match ``defaults.DATEMARK_FORMAT``.
    """
    user = User.get_current()
    data = request.get_json()
    if not (user and user.name == 'root'):
        raise errors.UserForbiddenError

    raw_keepalive = data.get('keepalive')
    try:
        keepalive = dt.datetime.strptime(raw_keepalive,
                                         defaults.DATEMARK_FORMAT)
    except (TypeError, ValueError) as exc:
        # TypeError covers a missing/non-string keepalive, which strptime
        # rejects before it even looks at the format.
        raise errors.InvalidDateFormat(raw_keepalive,
                                       defaults.DATEMARK_FORMAT) from exc

    current_app.dm.cluster_manager.put(server_id, keepalive)
    _cluster_logger.debug(
        f"{getattr(Server.query.get(server_id), 'name', server_id) or server_id} is a new alive server"
    )

    current_app.dm.route_manager.new_node_in_cluster(server_id,
                                                     data['routes'])

    return {
        'cluster':
        current_app.dm.cluster_manager.get_cluster(defaults.DATEMARK_FORMAT),
        'neighbours': [s.id for s in Server.get_neighbours()]
    }, 200
Example #3
0
async def _async_set_current_neighbours(
        neighbours: t.List[Server] = None,
        changed_routes: t.Dict[Server,
                               RouteContainer] = None) -> t.List[Server]:
    """Re-check every neighbour's gates and update its route accordingly.

    Args:
        neighbours: servers to check; defaults to ``Server.get_neighbours()``
            when ``None``.
        changed_routes: optional dict that receives ``server -> route`` for
            every server whose route was set by this call.

    Returns:
        The servers whose gate check returned ``None``, i.e. the ones that
        are not neighbours anymore.
    """
    if neighbours is None:
        neighbours = Server.get_neighbours()

    lost = []
    if not neighbours:
        return lost

    checks = [async_check_gates(candidate) for candidate in neighbours]
    outcomes = await asyncio.gather(*checks)

    for outcome, candidate in zip(outcomes, neighbours):
        if isinstance(outcome, RouteContainer):
            new_route = outcome
        elif outcome is None:
            # No reachable gate: record the loss and store an empty route.
            lost.append(candidate)
            new_route = RouteContainer(None, None, None)
        else:
            # Any other result leaves the server untouched.
            continue

        candidate.set_route(new_route)
        if changed_routes is not None:
            changed_routes[candidate] = new_route

    return lost
Example #4
0
 def _notify_cluster_out(self):
     """Tell every neighbour that this server is leaving the cluster.

     Posts a ``death`` timestamp to the ``api_1_0.cluster_out`` view of each
     neighbour in parallel (2 second timeout). Failed responses are logged
     as warnings only when the logger level is DEBUG or lower.
     """
     with self.dm.flask_app.app_context():
         servers = Server.get_neighbours()
         if not servers:
             self.logger.debug("No server to send shutdown information")
             return

         names = ', '.join(s.name for s in servers)
         self.logger.debug(f"Sending shutdown to {names}")

         # Built in the same order the request arguments were evaluated.
         view_data = dict(server_id=str(Server.get_current().id))
         payload = {'death': get_now().strftime(defaults.DATEMARK_FORMAT)}
         request_batch = ntwrk.parallel_requests(
             servers,
             'post',
             view_or_url='api_1_0.cluster_out',
             view_data=view_data,
             json=payload,
             timeout=2,
             auth=get_root_auth())
         responses = asyncio.run(request_batch)

         if self.logger.level > logging.DEBUG:
             return
         for failed in (r for r in responses if not r.ok):
             self.logger.warning(
                 f"Unable to send data to {failed.server}: {failed}")
Example #5
0
    def test_get_neighbours_no_route(self):
        """With no Route created, the current server reports no neighbours."""
        n1, me = Server('n1', port=8000), Server('me', port=8000, me=True)
        db.session.add_all([n1, me])

        found = me.get_neighbours()

        self.assertListEqual([], found)
Example #6
0
    def test_get_neighbours(self):
        """get_neighbours() lists n1 and n2 and honours ``exclude``.

        Routes are created for n1 (cost 0), n2 (through its first gate) and
        r1 (proxied by n1, cost 1); n3 has no route. ``exclude`` is exercised
        with a single server, a list of servers and a list of ids.
        """
        n1, n2, n3, r1 = (Server(name, port=8000)
                          for name in ('n1', 'n2', 'n3', 'r1'))
        Route(destination=n1, cost=0)
        Route(destination=n2, proxy_server_or_gate=n2.gates[0])
        Route(destination=r1, proxy_server_or_gate=n1, cost=1)

        me = Server('me', port=8000, me=True)
        db.session.add_all([n1, n2, n3, r1, me])

        self.assertListEqual([n1, n2], me.get_neighbours())

        only_n2 = [n2]
        self.assertListEqual(only_n2, me.get_neighbours(exclude=n1))
        self.assertListEqual(only_n2, me.get_neighbours(exclude=[n1, n3]))
        self.assertListEqual(only_n2,
                             me.get_neighbours(exclude=[n1.id, n3.id]))
Example #7
0
    async def _async_get_neighbour_healthcheck(self,
                                               cluster_heartbeat_id: str = None
                                               ) -> t.Dict[Server, dict]:
        """Post a heartbeat to every neighbour and collect their healthchecks.

        Args:
            cluster_heartbeat_id: heartbeat string sent to the neighbours;
                defaults to the current time in ``defaults.DATETIME_FORMAT``.

        Returns:
            Mapping of neighbour -> healthcheck message for each neighbour
            that answered OK and did not report a different server id.
            Failures and id mismatches are logged as warnings and skipped.
        """
        healthy = {}
        servers = Server.get_neighbours()
        names = ', '.join(s.name for s in servers)
        self.logger.debug(f"Neighbour servers to check: {names}")

        auth = get_root_auth()
        if cluster_heartbeat_id is None:
            cluster_heartbeat_id = get_now().strftime(defaults.DATETIME_FORMAT)

        def _probe(target):
            # A fresh payload per request, exactly as many dicts as servers.
            body = {'me': self.dm.server_id, 'heartbeat': cluster_heartbeat_id}
            return ntwrk.async_post(target,
                                    'root.healthcheck',
                                    json=body,
                                    auth=auth)

        responses = await asyncio.gather(*[_probe(s) for s in servers])

        for server, resp in zip(servers, responses):
            if not resp.ok:
                self.logger.warning(
                    f"Unable to get Healthcheck from server {server.name}: {resp}"
                )
                continue

            id_response = resp.msg.get('server', {}).get('id', '')
            if id_response and str(server.id) != id_response:
                # The node answering is not the one we believe we contacted.
                expected = {'id': str(server.id), 'name': server.name}
                mismatch = HealthCheckMismatch(
                    expected=expected, actual=resp.msg.get('server', {}))
                self.logger.warning(str(mismatch))
                continue

            healthy[server] = resp.msg
        return healthy
Example #8
0
    def _notify_cluster_in(self):
        """Announce this node to its neighbours once routing is initialised.

        Waits up to 120 seconds for ``self._route_initiated``; a timeout is
        only logged, while an exception raised by the wait aborts silently.
        Then a 'Cluster IN' message (current keepalive plus all local routes)
        is posted to each neighbour in random order, except that the server
        stored in the ``new_gates_server`` parameter (or, failing that, the
        join server) is contacted first.

        Every successful response feeds the returned cluster entries into
        ``self.put_many`` and adds the responder's own ``neighbours`` to the
        set of servers that are skipped afterwards.
        """
        from dimensigon.domain.entities import Server
        import dimensigon.web.network as ntwrk
        from dimensigon.domain.entities import Parameter

        try:
            signaled = self._route_initiated.wait(timeout=120)
        except Exception:
            return

        if not signaled:
            self.logger.warning("Route Event not fired.")

        self.logger.debug("Notify Cluster")
        with self.dm.flask_app.app_context():
            not_notify = set()
            me = Server.get_current()

            routes = [
                r.to_json() for r in Route.query.options(
                    orm.lazyload(Route.destination), orm.lazyload(Route.gate),
                    orm.lazyload(Route.proxy_server)).all()
            ]

            neighbours = Server.get_neighbours()

            # Single lookup of the parameter instead of two.
            join_server_id = Parameter.get('join_server', None)
            join_server = (Server.query.get(join_server_id)
                           if join_server_id else None)

            now = get_now()
            msg = dict(keepalive=now.strftime(defaults.DATEMARK_FORMAT),
                       routes=routes)
            if neighbours:
                random.shuffle(neighbours)
                # Loop-invariant: read the parameter once, not per neighbour.
                new_gates_server_id = Parameter.get('new_gates_server', None)
                first = [s for s in neighbours if s.id == new_gates_server_id]
                if first:
                    neighbours.remove(first[0])
                    neighbours = first + neighbours
                elif join_server in neighbours:
                    neighbours.remove(join_server)
                    neighbours = [join_server] + neighbours

                for s in neighbours:
                    if s.id in not_notify:
                        self.logger.debug(
                            f"Skiping server {s} from sending 'Cluster IN' message"
                        )
                        continue

                    self.logger.debug(f"Sending 'Cluster IN' message to {s}")
                    resp = ntwrk.post(s,
                                      'api_1_0.cluster_in',
                                      view_data=dict(server_id=str(me.id)),
                                      json=msg,
                                      timeout=10,
                                      auth=get_root_auth())
                    if not resp.ok:
                        self.logger.debug(
                            f"Unable to send 'Cluster IN' message to {s} . Response: {resp}"
                        )
                        continue

                    converted = []
                    for ident, str_keepalive, death in resp.msg['cluster']:
                        try:
                            keepalive = dt.datetime.strptime(
                                str_keepalive, defaults.DATEMARK_FORMAT)
                        except ValueError:
                            # Entries with an unparsable keepalive are dropped.
                            continue
                        converted.append((ident, keepalive, death))
                    self.put_many(converted)
                    not_notify.update(resp.msg.get('neighbours', []))
            else:
                self.logger.debug("No neighbour to send 'Cluster IN'")
        self.logger.debug("Notify Cluster ended")
Example #9
0
    def _send_data(self):
        """Flush the buffered cluster changes to every neighbour.

        The buffer is swapped out under ``self._change_buffer_lock`` and its
        entries are posted to ``api_1_0.cluster`` on all neighbours. If the
        send raises, the swapped-out entries are merged back into the buffer
        (entries that arrived meanwhile win). Afterwards a one-second timer
        re-runs this method when the buffer is not empty; otherwise
        ``self._timer`` is cleared.

        The SQLAlchemy session opened here is always closed, including when
        an unexpected exception propagates.
        """
        session = self.Session()

        def log_data(data):
            """Render buffered entries as JSON-serialisable dicts for logging."""
            debug_data = []
            for cr in data:
                server = dict(id=cr.id)
                name = getattr(session.query(Server).get(cr.id), 'name', cr.id)
                if name:
                    server.update(name=name)

                debug_data.append({
                    'server': server,
                    'keepalive':
                    cr.keepalive.strftime(defaults.DATEMARK_FORMAT),
                    'death': cr.death
                })
            return debug_data

        try:
            # time to send data
            with self.dm.flask_app.app_context():
                neighbours = Server.get_neighbours(session=session)
                if not neighbours:
                    self.logger.debug(
                        "No neighbour servers to send cluster information")
                    with self._change_buffer_lock:
                        self._timer = None
                    return

                with self._change_buffer_lock:
                    temp_buffer = dict(self._buffer)
                    self._buffer.clear()

                self.logger.debug(
                    f"Sending cluster information to the following nodes: {', '.join([s.name for s in neighbours])}"
                )
                self.logger.log(
                    1,
                    f"{json.dumps(log_data(temp_buffer.values()), indent=2)}")

                auth = get_root_auth()
                try:
                    responses = asyncio.run(
                        ntwrk.parallel_requests(
                            neighbours,
                            'POST',
                            view_or_url='api_1_0.cluster',
                            json=[{
                                'id': e.id,
                                'keepalive':
                                e.keepalive.strftime(defaults.DATEMARK_FORMAT),
                                'death': e.death
                            } for e in temp_buffer.values()],
                            auth=auth,
                            securizer=False), )
                except Exception as e:
                    self.logger.error(
                        f"Unable to send cluster information to neighbours: {format_exception(e)}"
                    )
                    # restore data with new data arrived
                    with self._change_buffer_lock:
                        # Positional update: ``**`` unpacking would raise
                        # TypeError for any key that is not a str.
                        temp_buffer.update(self._buffer)
                        self._buffer.clear()
                        self._buffer.update(temp_buffer)
                else:
                    for r in responses:
                        if not r.ok:
                            self.logger.warning(
                                f"Unable to send data to {r.server}: {r}")

                # check if new data arrived during timer execution
                with self._change_buffer_lock:
                    if self._buffer:
                        self._timer = threading.Timer(interval=1,
                                                      function=self._send_data)
                        self._timer.start()
                    else:
                        self._timer = None
        finally:
            # Release the session on every exit path.
            session.close()
Example #10
0
    async def _send_routes(self, exclude=None):
        """Send the pending route changes to every neighbour.

        Args:
            exclude: a server, a server id, or an iterable of either. Their
                ids are added to the ``exclude`` list sent with the message,
                together with the ids of all neighbours being notified.

        The message is PATCHed to ``api_1_0.routes`` on each neighbour in
        parallel; failures are logged as warnings. ``self._changed_routes``
        is cleared at the end regardless of the outcome.
        """
        servers = Server.get_neighbours(session=self.session)
        msg, debug_msg = self._format_routes_message(self._changed_routes)

        # Resolve the excluded nodes unconditionally: the ids are part of the
        # payload, so they must not depend on the logging level.
        c_exclude = []
        if exclude:
            raw_exclude = (exclude
                           if is_iterable_not_string(exclude) else [exclude])
            for item in raw_exclude:
                entity = (item if isinstance(item, Server) else
                          self.session.query(Server).get(item))
                # Keep the raw identifier when no matching server exists.
                c_exclude.append(item if entity is None else entity)

        if self.logger.level <= logging.DEBUG:
            if exclude:
                # str() guards against ids or names that are not strings.
                excluded_names = ', '.join(
                    str(getattr(e, 'name', e)) for e in c_exclude)
                log_msg = f" (Excluded nodes: {excluded_names}):"
            else:
                log_msg = ''

            if servers:
                log_msg = f"Sending route information to the following nodes: {', '.join([s.name for s in servers])} " \
                          f"{log_msg}{json.dumps(debug_msg, indent=2)}"
            else:
                # The payload is included once (it used to be appended twice).
                log_msg = f"No servers to send new routing information:{log_msg}{json.dumps(debug_msg, indent=2)}"

            if debug_msg and (servers or exclude):
                self.logger.debug(log_msg)

        exclude_ids = list({s.id for s in servers
                            }.union(getattr(e, 'id', e) for e in c_exclude))

        auth = get_root_auth()
        aw = [
            ntwrk.async_patch(s,
                              view_or_url='api_1_0.routes',
                              json={
                                  'server_id': self.server.id,
                                  'route_list': msg,
                                  'exclude': exclude_ids
                              },
                              auth=auth) for s in servers
        ]

        rs = await asyncio.gather(*aw, return_exceptions=True)

        for r, s in zip(rs, servers):
            if isinstance(r, Exception):
                self.logger.warning(
                    f"Error while trying to send route data to node {s}: "
                    f"{format_exception(r)}")
            elif not r.ok:
                if r.exception:
                    self.logger.warning(
                        f"Error while trying to send route data to node {s}: "
                        f"{format_exception(r.exception)}")
                else:
                    self.logger.warning(
                        f"Error while trying to send route data to node {s}: {r}"
                    )
        self._changed_routes.clear()
Example #11
0
    async def _async_refresh_route_table(
            self,
            discover_new_neighbours=False,
            check_current_neighbours=False,
            max_num_discovery=None) -> t.Dict[Server, RouteContainer]:
        """Refresh the local route table from the neighbours' route tables.

        Needs a Flask App Context to run.

        Parameters
        ----------
        discover_new_neighbours:
            tries to discover new neighbours among the non-neighbour servers
        check_current_neighbours:
            checks if current neighbours are still neighbours
        max_num_discovery:
            maximum number of possible nodes to check as neighbour

        Returns
        -------
        dict
            every server whose route changed, mapped to its new route
        """
        self.logger.debug('Refresh Route Table')
        neighbours = Server.get_neighbours(session=self.session)
        not_neighbours = Server.get_not_neighbours(session=self.session)

        changed_routes: t.Dict[Server, RouteContainer] = {}
        lost_neighbours = []
        found_neighbours = []

        # Decide once which probes run; the same flags are used afterwards to
        # pick their results off the gathered list in order.
        run_check = bool(check_current_neighbours and neighbours)
        run_discovery = bool(discover_new_neighbours
                             and not_neighbours[:max_num_discovery])

        pending = []
        if run_check:
            self.logger.debug("Checking current neighbours: " +
                              ', '.join(str(s) for s in neighbours))
            pending.append(
                _async_set_current_neighbours(neighbours, changed_routes))
        elif check_current_neighbours:
            self.logger.debug("No neighbour to check")

        if run_discovery:
            candidates = list(not_neighbours)
            random.shuffle(candidates)
            target = candidates[:max_num_discovery]
            target.sort(key=lambda s: s.name)
            limit_note = (f' (limited to {max_num_discovery})'
                          if max_num_discovery else '')
            self.logger.debug(f"Checking new neighbours{limit_note}: " +
                              ', '.join(str(s) for s in target))
            pending.append(
                _async_discover_new_neighbours(target, changed_routes))
        elif discover_new_neighbours:
            self.logger.debug("No new neighbours to check")

        results = await asyncio.gather(*pending, return_exceptions=False)

        if run_check:
            lost_neighbours = results.pop(0)
            if lost_neighbours:
                self.logger.info(
                    "Lost direct connection to the following nodes: " +
                    ', '.join(str(s) for s in lost_neighbours))
        if run_discovery:
            found_neighbours = results.pop(0)
            if found_neighbours:
                self.logger.info('New neighbours found: ' +
                                 ', '.join(str(s) for s in found_neighbours))
            else:
                self.logger.debug("No new neighbours found")

        # remove routes whose proxy_server is a node that is not a neighbour
        unreachable_ids = [
            s.id for s in set(not_neighbours).union(set(lost_neighbours))
        ]
        stale_routes = self.session.query(Route).filter(
            Route.proxy_server_id.in_(unreachable_ids))
        empty_route = RouteContainer(None, None, None)
        for route in stale_routes.all():
            route.set_route(empty_route)
            changed_routes[route.destination] = empty_route
        self.session.commit()

        # update neighbour list
        neighbours = list(
            set(neighbours).union(set(found_neighbours)) -
            set(lost_neighbours))

        if not neighbours:
            return changed_routes

        self.logger.debug(
            f"Getting routing tables from {', '.join([str(s) for s in neighbours])}"
        )
        responses = await asyncio.gather(*[
            ntwrk.async_get(server, 'api_1_0.routes', auth=get_root_auth())
            for server in neighbours
        ])

        merged = self._route_table_merge(dict(zip(neighbours, responses)))
        changed_routes.update(merged)

        return changed_routes
Example #12
0
    def _route_table_merge(self, data: t.Dict[Server, ntwrk.Response]):
        """Merge the neighbours' route tables into the local route table.

        Args:
            data: mapping of neighbour -> response of its routes endpoint.
                Responses whose code is not 200 are logged as errors and
                ignored.

        Returns:
            Dict of destination server -> ``RouteContainer`` for every route
            that was changed.

        For each advertised route that neither targets this server, nor is
        proxied by it, nor uses one of its gates, a candidate is recorded:
        the advertised cost plus one through the answering neighbour, or a
        cost-less candidate when the neighbour cannot reach the destination.
        For every destination that is not a direct neighbour, the cheapest
        candidate replaces the stored route when proxy or cost differ.
        """
        changed_routes: t.Dict[Server, RouteContainer] = {}
        temp_table_routes: t.Dict[uuid.UUID, t.List[RouteContainer]] = {}
        for s, resp in data.items():
            if resp.code != 200:
                self.logger.error(
                    f"Error while connecting with {s}. Error: {resp}")
                continue

            server_id = resp.msg.get(
                'server_id', None) or resp.msg.get('server').get('id')
            likely_proxy_server_entity = self.session.query(Server).get(
                server_id)
            # Built once per response instead of once per advertised route.
            own_gate_ids = [gate.id for gate in self.server.gates]
            for route_json in resp.msg['route_list']:
                route_json = convert(route_json)
                if (route_json.destination_id == self.server.id
                        or route_json.proxy_server_id == self.server.id
                        or route_json.gate_id in own_gate_ids):
                    # Routes to, through, or via a gate of this server say
                    # nothing new about how to reach other nodes.
                    continue

                candidates = temp_table_routes.setdefault(
                    route_json.destination_id, [])
                if route_json.cost is not None:
                    # One extra hop: the neighbour becomes the proxy.
                    route_json.cost += 1
                    route_json.proxy_server_id = likely_proxy_server_entity.id
                    route_json.gate_id = None
                    candidates.append(
                        RouteContainer(likely_proxy_server_entity.id, None,
                                       route_json.cost))
                else:
                    # remove a routing if gateway cannot reach the destination
                    candidates.append(
                        RouteContainer(route_json.proxy_server_id, None,
                                       None))

        # Select new routes based on neighbour routes
        neighbour_ids = [
            s.id for s in Server.get_neighbours(session=self.session)
        ]
        for destination_id, candidates in temp_table_routes.items():
            if destination_id in neighbour_ids:
                continue
            route = self.session.query(Route).filter_by(
                destination_id=destination_id).one_or_none()
            if not route:
                server = self.session.query(Server).get(destination_id)
                if not server:
                    # Unknown destination: nothing to attach a route to.
                    continue
                route = Route(destination=server)
            # Cost-less candidates sort last.
            candidates.sort(key=lambda x: x.cost or MAX_COST)
            if candidates:
                min_route = candidates[0]
                proxy_server: Server = self.session.query(Server).get(
                    min_route.proxy_server)
                cost = min_route.cost
                if route.proxy_server != proxy_server or route.cost != cost:
                    rc = RouteContainer(proxy_server, None, cost)
                    route.set_route(rc)
                    changed_routes[route.destination] = rc
                    self.session.add(route)

        return changed_routes
Example #13
0
    def bootstrap(self):
        """ bootstraps the application. Gunicorn is still not listening on sockets

        Resets the lockers and compares the gates derived from the configured
        ``bind`` with the gates stored for the current server. When new gates
        exist they are announced to a neighbour through the
        ``api_1_0.serverresource`` view, starting with the join server (when
        it is a neighbour) and otherwise the last neighbour in the list:

        * a 409 answer retries the same server, up to 3 times;
        * a 500 answer, or 3 consecutive 409s, moves on to the previous
          server in the list;
        * any other failure retries the same server;
        * the whole procedure gives up after 900 seconds or when no server
          is left to try.

        With no neighbours at all, and only one server in the database, the
        gates are created locally.

        Raises:
            RuntimeError: no server is flagged as the current one.
        """
        with self.app_context():
            from dimensigon.domain.entities import Server, Parameter
            import dimensigon.web.network as ntwrk
            from dimensigon.domain.entities import Locker

            # reset scopes
            Locker.set_initial(unlock=True)

            # check gates
            me = Server.get_current()
            if me is None:
                raise RuntimeError("No server set as 'current'")

            input_gates = bind2gate(self.dm.config.http_conf.get('bind'))
            current_gates = [(gate.dns or str(gate.ip), gate.port)
                             for gate in me.gates]
            new_gates = set(input_gates).difference(set(current_gates))
            self.server_id_with_new_gates = None
            if not new_gates:
                return

            join_server_id = Parameter.get('join_server')
            join_server = (Server.query.get(join_server_id)
                           if join_server_id else None)
            servers = Server.get_neighbours()
            if join_server in servers:
                # Candidates are tried from the end, so the join server goes
                # last to be contacted first.
                servers.remove(join_server)
                servers.append(join_server)
            elif join_server_id:
                # Only worth a warning when a join server is configured.
                self.logger.warning(
                    f'Join server {join_server} is not a neighbour')

            start = time.time()
            resp = None
            # Candidate and retry counter live outside the loop; resetting
            # them on every iteration would make the fallback logic below
            # unreachable.
            server = servers[-1] if servers else None
            server_retries = 0
            while server is not None and (time.time() - start) < 900:
                self.logger.debug(
                    f"Sending new gates {new_gates} to {server}...")
                resp = ntwrk.patch(
                    server,
                    'api_1_0.serverresource',
                    view_data=dict(server_id=str(Server.get_current().id)),
                    json={
                        'gates': [{
                            'dns_or_ip': ip,
                            'port': port
                        } for ip, port in new_gates]
                    },
                    timeout=60,
                    auth=get_root_auth())
                if resp.ok:
                    self.logger.debug("New gates created succesfully")
                    Parameter.set('new_gates_server', server.id)
                    break

                self.logger.debug(
                    f"Unable to send new gates to {server}. Reason: {resp}")
                self.logger.info(
                    "Unable to create new gates. Trying to send again in 5 seconds..."
                )
                time.sleep(5)

                change_server = False
                if resp.code == 409:
                    # try with the same server, a limited number of times
                    server_retries += 1
                    change_server = server_retries == 3
                elif resp.code == 500:
                    # try with another server
                    change_server = True

                if change_server:
                    previous = servers.index(server) - 1
                    server = servers[previous] if previous >= 0 else None
                    server_retries = 0

            if not servers:
                if Server.query.count() == 1:
                    self.logger.info(
                        f"Creating new gates {new_gates} without performing a lock on catalog"
                    )
                    for gate in new_gates:
                        new_gate = me.add_new_gate(gate[0], gate[1])
                        db.session.add(new_gate)
            else:
                if resp and not resp.ok:
                    self.logger.warning(
                        f"Remote servers may not connect with {me}. ")
            db.session.commit()