def _get_new_device_messages(self):
    """Fetch the pending to-device messages and device-list updates for
    this destination and wrap each one as an EDU.

    Returns (via Deferred): a 3-tuple of (edus, to-device stream id,
    device-list stream id).
    """
    device_stream_pos = self._last_device_stream_id
    to_device_pos = self._store.get_to_device_stream_token()
    contents, stream_id = yield self._store.get_new_device_msgs_for_remote(
        self._destination, device_stream_pos, to_device_pos)

    edus = []
    for content in contents:
        edus.append(
            Edu(
                origin=self._server_name,
                destination=self._destination,
                edu_type="m.direct_to_device",
                content=content,
            )
        )

    device_list_pos = self._last_device_list_stream_id
    now_stream_id, results = yield self._store.get_devices_by_remote(
        self._destination, device_list_pos)
    for content in results:
        edus.append(
            Edu(
                origin=self._server_name,
                destination=self._destination,
                edu_type="m.device_list_update",
                content=content,
            )
        )

    defer.returnValue((edus, stream_id, now_stream_id))
def _get_new_device_messages(self, limit):
    """Fetch at most `limit` device-related EDUs for this destination:
    device-list updates first, then to-device messages in whatever room
    is left.

    Returns (via Deferred): a 3-tuple of (edus, to-device stream id,
    device-list stream id).
    """
    # Device-list updates come first; the store caps these itself
    # (at most 20 entries).
    device_list_pos = self._last_device_list_stream_id
    now_stream_id, results = yield self._store.get_devices_by_remote(
        self._destination, device_list_pos)

    edus = []
    for content in results:
        edus.append(
            Edu(
                origin=self._server_name,
                destination=self._destination,
                edu_type="m.device_list_update",
                content=content,
            )
        )

    assert len(edus) <= limit, "get_devices_by_remote returned too many EDUs"

    # Fill the remaining slots with to-device messages.
    device_stream_pos = self._last_device_stream_id
    to_device_pos = self._store.get_to_device_stream_token()
    contents, stream_id = yield self._store.get_new_device_msgs_for_remote(
        self._destination,
        device_stream_pos,
        to_device_pos,
        limit - len(edus),
    )
    for content in contents:
        edus.append(
            Edu(
                origin=self._server_name,
                destination=self._destination,
                edu_type="m.direct_to_device",
                content=content,
            )
        )

    defer.returnValue((edus, stream_id, now_stream_id))
def build_and_send_edu(
    self,
    destination: str,
    edu_type: str,
    content: dict,
    key: Optional[Hashable] = None,
):
    """Build an EDU of the given type and queue it for delivery.

    Args:
        destination: name of server to send to
        edu_type: type of EDU to send
        content: content of EDU
        key: clobbering key for this edu
    """
    # Sending an EDU to ourselves over federation makes no sense.
    if destination == self.server_name:
        logger.info("Not sending EDU to ourselves")
        return

    # Skip destinations this federation-sender instance is not
    # responsible for.
    should_send = self._federation_shard_config.should_handle(
        self._instance_name, destination
    )
    if not should_send:
        return

    self.send_edu(
        Edu(
            origin=self.server_name,
            destination=destination,
            edu_type=edu_type,
            content=content,
        ),
        key,
    )
async def _get_to_device_message_edus(self, limit: int) -> Tuple[List[Edu], int]:
    """Fetch up to `limit` pending to-device messages for this
    destination, wrapped as direct-to-device EDUs.

    Returns:
        A tuple of the EDUs and the to-device stream id they run up to.
    """
    prev_stream_id = self._last_device_stream_id
    to_device_stream_id = self._store.get_to_device_stream_token()
    contents, stream_id = await self._store.get_new_device_msgs_for_remote(
        self._destination, prev_stream_id, to_device_stream_id, limit
    )

    # Record each message id on the active tracing span, to help debug
    # lost to-device messages.
    for content in contents:
        message_id = content.get("message_id")
        if message_id:
            set_tag(SynapseTags.TO_DEVICE_MESSAGE_ID, message_id)

    edus = []
    for content in contents:
        edus.append(
            Edu(
                origin=self._server_name,
                destination=self._destination,
                edu_type=EduTypes.DIRECT_TO_DEVICE,
                content=content,
            )
        )

    if edus:
        issue9533_logger.debug(
            "Sending %i to-device messages to %s, up to stream id %i",
            len(edus),
            self._destination,
            stream_id,
        )

    return edus, stream_id
async def _process_edu(edu_dict):
    """Process one EDU received from `origin`, dispatching it to the
    handler registered for its type."""
    received_edus_counter.inc()

    incoming = Edu(
        origin=origin,
        destination=self.server_name,
        edu_type=edu_dict["edu_type"],
        content=edu_dict["content"],
    )
    await self.registry.on_edu(incoming.edu_type, origin, incoming.content)
def _get_rr_edus(self, force_flush: bool) -> Iterable[Edu]:
    """Yield a single m.receipt EDU bundling all pending read receipts.

    Yields nothing when there are no pending receipts, or when the flush
    timer has not fired and `force_flush` is False. Clears the pending
    receipt state when it does yield.
    """
    pending = self._pending_rrs
    if not pending:
        return

    # Unless forced, wait for the flush timer.
    if not (force_flush or self._rrs_pending_flush):
        # not yet time for this lot
        return

    self._pending_rrs = {}
    self._rrs_pending_flush = False

    yield Edu(
        origin=self._server_name,
        destination=self._destination,
        edu_type="m.receipt",
        content=pending,
    )
def _get_to_device_message_edus(self, limit):
    """Fetch up to `limit` pending to-device messages for this
    destination as m.direct_to_device EDUs.

    Returns (via Deferred): (edus, stream id the messages run up to).
    """
    since_stream_id = self._last_device_stream_id
    current_stream_id = self._store.get_to_device_stream_token()
    contents, stream_id = yield self._store.get_new_device_msgs_for_remote(
        self._destination, since_stream_id, current_stream_id, limit)

    edus = []
    for content in contents:
        edus.append(
            Edu(
                origin=self._server_name,
                destination=self._destination,
                edu_type="m.direct_to_device",
                content=content,
            )
        )

    return (edus, stream_id)
def _get_device_update_edus(self, limit):
    """Fetch up to `limit` new device-list updates for this destination
    as m.device_list_update EDUs.

    Returns (via Deferred): (edus, stream id the updates run up to).
    """
    since_stream_id = self._last_device_list_stream_id

    # Retrieve list of new device updates to send to the destination
    now_stream_id, results = yield self._store.get_devices_by_remote(
        self._destination, since_stream_id, limit=limit)

    edus = []
    for content in results:
        edus.append(
            Edu(
                origin=self._server_name,
                destination=self._destination,
                edu_type="m.device_list_update",
                content=content,
            )
        )

    assert len(edus) <= limit, "get_devices_by_remote returned too many EDUs"

    defer.returnValue((edus, now_stream_id))
async def _get_device_update_edus(self, limit: int) -> Tuple[List[Edu], int]:
    """Fetch up to `limit` new device updates for this destination, each
    wrapped as an EDU of the type the store reports for it.

    Returns:
        A tuple of the EDUs and the device-list stream id they run up to.
    """
    since_stream_id = self._last_device_list_stream_id

    # Retrieve list of new device updates to send to the destination
    now_stream_id, results = await self._store.get_device_updates_by_remote(
        self._destination, since_stream_id, limit=limit
    )

    edus = []
    for edu_type, content in results:
        edus.append(
            Edu(
                origin=self._server_name,
                destination=self._destination,
                edu_type=edu_type,
                content=content,
            )
        )

    assert len(edus) <= limit, "get_device_updates_by_remote returned too many EDUs"

    return (edus, now_stream_id)
def build_and_send_edu(self, destination, edu_type, content, key=None):
    """Build an EDU of the given type and queue it for delivery.

    Args:
        destination (str): name of server to send to
        edu_type (str): type of EDU to send
        content (dict): content of EDU
        key (Any|None): clobbering key for this edu
    """
    # Sending an EDU to ourselves over federation makes no sense.
    if destination == self.server_name:
        logger.info("Not sending EDU to ourselves")
        return

    self.send_edu(
        Edu(
            origin=self.server_name,
            destination=destination,
            edu_type=edu_type,
            content=content,
        ),
        key,
    )
async def _get_to_device_message_edus(self, limit: int) -> Tuple[List[Edu], int]:
    """Fetch up to `limit` pending to-device messages for this
    destination, wrapped as m.direct_to_device EDUs.

    Returns:
        A tuple of the EDUs and the to-device stream id they run up to.
    """
    prev_stream_id = self._last_device_stream_id
    to_device_stream_id = self._store.get_to_device_stream_token()
    contents, stream_id = await self._store.get_new_device_msgs_for_remote(
        self._destination, prev_stream_id, to_device_stream_id, limit
    )

    # Record each message id on the active tracing span, to help debug
    # lost to-device messages.
    for content in contents:
        message_id = content.get("message_id")
        if message_id:
            set_tag(SynapseTags.TO_DEVICE_MESSAGE_ID, message_id)

    edus = []
    for content in contents:
        edus.append(
            Edu(
                origin=self._server_name,
                destination=self._destination,
                edu_type="m.direct_to_device",
                content=content,
            )
        )

    return (edus, stream_id)
async def __aenter__(self) -> Tuple[List[EventBase], List[Edu]]:
    """Snapshot up to one transaction's worth of pending PDUs and EDUs
    from the queue.

    Returns the PDUs and EDUs to include in the transaction (both empty
    if there is nothing to send). Records the stream positions consumed
    on `self` so they can be committed after a successful send.

    NOTE(review): the ordering here is deliberate — device EDUs are
    budgeted first (leaving 2 slots), then receipts/presence, then
    everything else. Do not reorder.
    """
    # First we calculate the EDUs we want to send, if any.

    # We start by fetching device related EDUs, i.e device updates and to
    # device messages. We have to keep 2 free slots for presence and rr_edus.
    limit = MAX_EDUS_PER_TRANSACTION - 2

    device_update_edus, dev_list_id = await self.queue._get_device_update_edus(
        limit)

    # If we sent nothing, advance the queue's position immediately;
    # otherwise remember it so it is only advanced on successful send.
    if device_update_edus:
        self._device_list_id = dev_list_id
    else:
        self.queue._last_device_list_stream_id = dev_list_id

    limit -= len(device_update_edus)

    (
        to_device_edus,
        device_stream_id,
    ) = await self.queue._get_to_device_message_edus(limit)

    if to_device_edus:
        self._device_stream_id = device_stream_id
    else:
        self.queue._last_device_stream_id = device_stream_id

    pending_edus = device_update_edus + to_device_edus

    # Now add the read receipt EDU.
    pending_edus.extend(self.queue._get_rr_edus(force_flush=False))

    # And presence EDU.
    if self.queue._pending_presence:
        pending_edus.append(
            Edu(
                origin=self.queue._server_name,
                destination=self.queue._destination,
                edu_type=EduTypes.PRESENCE,
                content={
                    "push": [
                        format_user_presence_state(
                            presence, self.queue._clock.time_msec())
                        for presence in self.queue._pending_presence.values()
                    ]
                },
            ))
        self.queue._pending_presence = {}

    # Finally add any other types of EDUs if there is room.
    pending_edus.extend(
        self.queue._pop_pending_edus(MAX_EDUS_PER_TRANSACTION - len(pending_edus)))
    # Drain keyed EDUs (latest value per key) while slots remain.
    while (len(pending_edus) < MAX_EDUS_PER_TRANSACTION
           and self.queue._pending_edus_keyed):
        _, val = self.queue._pending_edus_keyed.popitem()
        pending_edus.append(val)

    # Now we look for any PDUs to send, by getting up to 50 PDUs from the
    # queue
    self._pdus = self.queue._pending_pdus[:50]

    if not self._pdus and not pending_edus:
        return [], []

    # if we've decided to send a transaction anyway, and we have room, we
    # may as well send any pending RRs
    if len(pending_edus) < MAX_EDUS_PER_TRANSACTION:
        pending_edus.extend(self.queue._get_rr_edus(force_flush=True))

    if self._pdus:
        # Remember how far into the event stream this transaction reaches,
        # so __aexit__ (presumably) can mark it as sent — confirm against
        # the exit handler.
        self._last_stream_ordering = self._pdus[
            -1].internal_metadata.stream_ordering
        assert self._last_stream_ordering

    return self._pdus, pending_edus
async def _process_edu(edu_dict):
    """Process one EDU received from `origin`, dispatching it to the
    handler registered for its type."""
    received_edus_counter.inc()

    parsed = Edu(**edu_dict)
    await self.registry.on_edu(parsed.edu_type, origin, parsed.content)
def _handle_incoming_transaction(self, origin, transaction, request_time):
    """ Process an incoming transaction and return the HTTP response

    Args:
        origin (unicode): the server making the request
        transaction (Transaction): incoming transaction
        request_time (int): timestamp that the HTTP request arrived at

    Returns:
        Deferred[(int, object)]: http response code and body
    """
    # Transactions are idempotent: if we have already handled this one,
    # replay the stored response rather than re-processing.
    response = yield self.transaction_actions.have_responded(
        origin, transaction)

    if response:
        logger.debug("[%s] We've already responded to this request",
                     transaction.transaction_id)
        defer.returnValue(response)
        return

    logger.debug("[%s] Transaction is new", transaction.transaction_id)

    received_pdus_counter.inc(len(transaction.pdus))

    origin_host, _ = parse_server_name(origin)

    # Group the PDUs by room so rooms can be processed in parallel below.
    pdus_by_room = {}

    for p in transaction.pdus:
        # Rewrite the relative "age" field (ms the event spent queued on
        # the origin) into an absolute "age_ts" timestamp.
        if "unsigned" in p:
            unsigned = p["unsigned"]
            if "age" in unsigned:
                p["age"] = unsigned["age"]
        if "age" in p:
            p["age_ts"] = request_time - int(p["age"])
            del p["age"]

        event = event_from_pdu_json(p)
        room_id = event.room_id
        pdus_by_room.setdefault(room_id, []).append(event)

    # Map of event_id -> per-PDU result dict to return to the sender.
    pdu_results = {}

    # we can process different rooms in parallel (which is useful if they
    # require callouts to other servers to fetch missing events), but
    # impose a limit to avoid going too crazy with ram/cpu.

    @defer.inlineCallbacks
    def process_pdus_for_room(room_id):
        # Handles all PDUs for one room, recording per-event results.
        logger.debug("Processing PDUs for %s", room_id)
        try:
            yield self.check_server_matches_acl(origin_host, room_id)
        except AuthError as e:
            # The origin is banned by the room's server ACL: reject every
            # PDU for this room.
            logger.warn(
                "Ignoring PDUs for room %s from banned server",
                room_id,
            )
            for pdu in pdus_by_room[room_id]:
                event_id = pdu.event_id
                pdu_results[event_id] = e.error_dict()
            return

        for pdu in pdus_by_room[room_id]:
            event_id = pdu.event_id
            with nested_logging_context(event_id):
                try:
                    yield self._handle_received_pdu(origin, pdu)
                    # Empty dict signals success for this event.
                    pdu_results[event_id] = {}
                except FederationError as e:
                    logger.warn("Error handling PDU %s: %s", event_id, e)
                    pdu_results[event_id] = {"error": str(e)}
                except Exception as e:
                    # A failure on one PDU must not abort the rest of the
                    # transaction; record the error and carry on.
                    f = failure.Failure()
                    pdu_results[event_id] = {"error": str(e)}
                    logger.error(
                        "Failed to handle PDU %s: %s",
                        event_id,
                        f.getTraceback().rstrip(),
                    )

    yield concurrently_execute(
        process_pdus_for_room, pdus_by_room.keys(),
        TRANSACTION_CONCURRENCY_LIMIT,
    )

    # EDUs are processed sequentially, after all PDUs.
    if hasattr(transaction, "edus"):
        for edu in (Edu(**x) for x in transaction.edus):
            yield self.received_edu(origin, edu.edu_type, edu.content)

    response = {
        "pdus": pdu_results,
    }

    logger.debug("Returning: %s", str(response))

    # Persist the response so a retransmitted transaction gets the same
    # answer (see the dedup check at the top).
    yield self.transaction_actions.set_response(origin, transaction, 200,
                                                response)
    defer.returnValue((200, response))
async def _transaction_transmission_loop(self) -> None:
    """Repeatedly drain this destination's queues into transactions and
    send them, until there is nothing left to send or a send fails.

    Sets `self.transmission_loop_running` for the duration so that only
    one loop runs per destination at a time; callers are expected to
    check that flag before starting another (confirm against callers).
    """
    pending_pdus = []  # type: List[EventBase]
    try:
        self.transmission_loop_running = True

        # This will throw if we wouldn't retry. We do this here so we fail
        # quickly, but we will later check this again in the http client,
        # hence why we throw the result away.
        await get_retry_limiter(self._destination, self._clock, self._store)

        pending_pdus = []
        while True:
            # We have to keep 2 free slots for presence and rr_edus
            limit = MAX_EDUS_PER_TRANSACTION - 2

            device_update_edus, dev_list_id = await self._get_device_update_edus(
                limit)

            limit -= len(device_update_edus)

            (
                to_device_edus,
                device_stream_id,
            ) = await self._get_to_device_message_edus(limit)

            pending_edus = device_update_edus + to_device_edus

            # BEGIN CRITICAL SECTION
            #
            # In order to avoid a race condition, we need to make sure that
            # the following code (from popping the queues up to the point
            # where we decide if we actually have any pending messages) is
            # atomic - otherwise new PDUs or EDUs might arrive in the
            # meantime, but not get sent because we hold the
            # transmission_loop_running flag.

            pending_pdus = self._pending_pdus

            # We can only include at most 50 PDUs per transactions
            pending_pdus, self._pending_pdus = pending_pdus[:50], pending_pdus[50:]

            pending_edus.extend(self._get_rr_edus(force_flush=False))
            pending_presence = self._pending_presence
            self._pending_presence = {}
            if pending_presence:
                # Bundle all pending presence states into one m.presence EDU.
                pending_edus.append(
                    Edu(
                        origin=self._server_name,
                        destination=self._destination,
                        edu_type="m.presence",
                        content={
                            "push": [
                                format_user_presence_state(
                                    presence, self._clock.time_msec())
                                for presence in pending_presence.values()
                            ]
                        },
                    ))

            pending_edus.extend(
                self._pop_pending_edus(MAX_EDUS_PER_TRANSACTION - len(pending_edus)))
            # Drain keyed EDUs (latest value per key) while slots remain.
            while (len(pending_edus) < MAX_EDUS_PER_TRANSACTION
                   and self._pending_edus_keyed):
                _, val = self._pending_edus_keyed.popitem()
                pending_edus.append(val)

            if pending_pdus:
                logger.debug(
                    "TX [%s] len(pending_pdus_by_dest[dest]) = %d",
                    self._destination,
                    len(pending_pdus),
                )

            if not pending_pdus and not pending_edus:
                logger.debug("TX [%s] Nothing to send", self._destination)
                self._last_device_stream_id = device_stream_id
                return

            # if we've decided to send a transaction anyway, and we have room, we
            # may as well send any pending RRs
            if len(pending_edus) < MAX_EDUS_PER_TRANSACTION:
                pending_edus.extend(self._get_rr_edus(force_flush=True))

            # END CRITICAL SECTION

            success = await self._transaction_manager.send_new_transaction(
                self._destination, pending_pdus, pending_edus)
            if success:
                sent_transactions_counter.inc()
                sent_edus_counter.inc(len(pending_edus))
                for edu in pending_edus:
                    sent_edus_by_type.labels(edu.edu_type).inc()
                # Remove the acknowledged device messages from the database
                # Only bother if we actually sent some device messages
                if to_device_edus:
                    await self._store.delete_device_msgs_for_remote(
                        self._destination, device_stream_id)

                # also mark the device updates as sent
                if device_update_edus:
                    logger.info("Marking as sent %r %r", self._destination,
                                dev_list_id)
                    await self._store.mark_as_sent_devices_by_remote(
                        self._destination,
                        dev_list_id)

                self._last_device_stream_id = device_stream_id
                self._last_device_list_stream_id = dev_list_id
            else:
                # Send failed; give up and let the retry machinery
                # schedule another attempt later.
                break
    except NotRetryingDestination as e:
        logger.debug(
            "TX [%s] not ready for retry yet (next retry at %s) - "
            "dropping transaction for now",
            self._destination,
            datetime.datetime.fromtimestamp(
                (e.retry_last_ts + e.retry_interval) / 1000.0),
        )

        if e.retry_interval > 60 * 60 * 1000:
            # we won't retry for another hour!
            # (this suggests a significant outage)
            # We drop pending PDUs and EDUs because otherwise they will
            # rack up indefinitely.
            # Note that:
            # - the EDUs that are being dropped here are those that we can
            #   afford to drop (specifically, only typing notifications,
            #   read receipts and presence updates are being dropped here)
            # - Other EDUs such as to_device messages are queued with a
            #   different mechanism
            # - this is all volatile state that would be lost if the
            #   federation sender restarted anyway

            # dropping read receipts is a bit sad but should be solved
            # through another mechanism, because this is all volatile!
            self._pending_pdus = []
            self._pending_edus = []
            self._pending_edus_keyed = {}
            self._pending_presence = {}
            self._pending_rrs = {}
    except FederationDeniedError as e:
        logger.info(e)
    except HttpResponseException as e:
        logger.warning(
            "TX [%s] Received %d response to transaction: %s",
            self._destination,
            e.code,
            e,
        )
    except RequestSendFailed as e:
        logger.warning("TX [%s] Failed to send transaction: %s",
                       self._destination, e)

        for p in pending_pdus:
            logger.info("Failed to send event %s to %s", p.event_id,
                        self._destination)
    except Exception:
        logger.exception("TX [%s] Failed to send transaction",
                         self._destination)
        for p in pending_pdus:
            logger.info("Failed to send event %s to %s", p.event_id,
                        self._destination)
    finally:
        # We want to be *very* sure we clear this after we stop processing
        self.transmission_loop_running = False
def _transaction_transmission_loop(self):
    """Repeatedly drain this destination's queues into transactions and
    send them, until there is nothing left to send or a send fails.

    Sets `self.transmission_loop_running` for the duration so that only
    one loop runs per destination at a time; callers are expected to
    check that flag before starting another (confirm against callers).
    """
    pending_pdus = []
    try:
        self.transmission_loop_running = True

        # This will throw if we wouldn't retry. We do this here so we fail
        # quickly, but we will later check this again in the http client,
        # hence why we throw the result away.
        yield get_retry_limiter(self._destination, self._clock, self._store)

        pending_pdus = []
        while True:
            device_message_edus, device_stream_id, dev_list_id = (
                yield self._get_new_device_messages())

            # BEGIN CRITICAL SECTION
            #
            # In order to avoid a race condition, we need to make sure that
            # the following code (from popping the queues up to the point
            # where we decide if we actually have any pending messages) is
            # atomic - otherwise new PDUs or EDUs might arrive in the
            # meantime, but not get sent because we hold the
            # transmission_loop_running flag.

            pending_pdus = self._pending_pdus

            # We can only include at most 50 PDUs per transactions
            pending_pdus, self._pending_pdus = pending_pdus[:50], pending_pdus[50:]

            pending_edus = []
            pending_edus.extend(self._get_rr_edus(force_flush=False))

            # We can only include at most 100 EDUs per transactions
            pending_edus.extend(
                self._pop_pending_edus(100 - len(pending_edus)))

            pending_edus.extend(self._pending_edus_keyed.values())
            self._pending_edus_keyed = {}

            pending_edus.extend(device_message_edus)

            pending_presence = self._pending_presence
            self._pending_presence = {}
            if pending_presence:
                # Bundle all pending presence states into one m.presence EDU.
                pending_edus.append(
                    Edu(
                        origin=self._server_name,
                        destination=self._destination,
                        edu_type="m.presence",
                        content={
                            "push": [
                                format_user_presence_state(
                                    presence, self._clock.time_msec())
                                for presence in pending_presence.values()
                            ]
                        },
                    ))

            if pending_pdus:
                logger.debug(
                    "TX [%s] len(pending_pdus_by_dest[dest]) = %d",
                    self._destination, len(pending_pdus))

            if not pending_pdus and not pending_edus:
                logger.debug("TX [%s] Nothing to send", self._destination)
                self._last_device_stream_id = device_stream_id
                return

            # if we've decided to send a transaction anyway, and we have room, we
            # may as well send any pending RRs
            if len(pending_edus) < 100:
                pending_edus.extend(self._get_rr_edus(force_flush=True))

            # END CRITICAL SECTION

            success = yield self._transaction_manager.send_new_transaction(
                self._destination, pending_pdus, pending_edus)
            if success:
                sent_transactions_counter.inc()
                sent_edus_counter.inc(len(pending_edus))
                for edu in pending_edus:
                    sent_edus_by_type.labels(edu.edu_type).inc()
                # Remove the acknowledged device messages from the database
                # Only bother if we actually sent some device messages
                if device_message_edus:
                    yield self._store.delete_device_msgs_for_remote(
                        self._destination, device_stream_id)
                    logger.info("Marking as sent %r %r", self._destination,
                                dev_list_id)
                    yield self._store.mark_as_sent_devices_by_remote(
                        self._destination, dev_list_id)

                self._last_device_stream_id = device_stream_id
                self._last_device_list_stream_id = dev_list_id
            else:
                # Send failed; give up and let the retry machinery
                # schedule another attempt later.
                break
    except NotRetryingDestination as e:
        logger.debug(
            "TX [%s] not ready for retry yet (next retry at %s) - "
            "dropping transaction for now",
            self._destination,
            datetime.datetime.fromtimestamp(
                (e.retry_last_ts + e.retry_interval) / 1000.0),
        )
    except FederationDeniedError as e:
        logger.info(e)
    except HttpResponseException as e:
        logger.warning(
            "TX [%s] Received %d response to transaction: %s",
            self._destination, e.code, e,
        )
    except RequestSendFailed as e:
        logger.warning("TX [%s] Failed to send transaction: %s",
                       self._destination, e)

        # NOTE(review): pending_pdus holds bare events elsewhere in this
        # file, but is unpacked as (event, _) pairs here — confirm the
        # queue's element shape for this version.
        for p, _ in pending_pdus:
            logger.info("Failed to send event %s to %s", p.event_id,
                        self._destination)
    except Exception:
        logger.exception(
            "TX [%s] Failed to send transaction",
            self._destination,
        )
        for p, _ in pending_pdus:
            logger.info("Failed to send event %s to %s", p.event_id,
                        self._destination)
    finally:
        # We want to be *very* sure we clear this after we stop processing
        self.transmission_loop_running = False
def process_replication(self, result):
    """Process one batch of replication stream rows, forwarding
    federation-bound data (presence, EDUs, failures, device-message
    pokes) to the federation sender.

    Args:
        result (dict): replication payload; the "federation" and
            "events" entries are handled here.
    """
    # The federation stream contains things that we want to send out, e.g.
    # presence, typing, etc.
    fed_stream = result.get("federation")
    if fed_stream:
        latest_id = int(fed_stream["position"])

        # The federation stream contains a bunch of different types of
        # rows that need to be handled differently. We parse the rows, put
        # them into the appropriate collection and then send them off.
        presence_to_send = {}
        keyed_edus = {}
        edus = {}
        failures = {}
        device_destinations = set()

        # Parse the rows in the stream
        for row in fed_stream["rows"]:
            position, typ, content_js = row
            content = json.loads(content_js)

            if typ == send_queue.PRESENCE_TYPE:
                destination = content["destination"]
                state = UserPresenceState.from_dict(content["state"])

                presence_to_send.setdefault(destination, []).append(state)
            elif typ == send_queue.KEYED_EDU_TYPE:
                key = content["key"]
                edu = Edu(**content["edu"])

                # Later rows with the same (destination, key) clobber
                # earlier ones.
                keyed_edus.setdefault(edu.destination, {})[(edu.destination,
                                                            tuple(key))] = edu
            elif typ == send_queue.EDU_TYPE:
                edu = Edu(**content)
                edus.setdefault(edu.destination, []).append(edu)
            elif typ == send_queue.FAILURE_TYPE:
                destination = content["destination"]
                failure = content["failure"]
                failures.setdefault(destination, []).append(failure)
            elif typ == send_queue.DEVICE_MESSAGE_TYPE:
                # Only the destination is carried; the messages themselves
                # are fetched from the database by the sender.
                device_destinations.add(content["destination"])
            else:
                raise Exception("Unrecognised federation type: %r", typ)

        # We've finished collecting, send everything off
        for destination, states in presence_to_send.items():
            self.federation_sender.send_presence(destination, states)

        for destination, edu_map in keyed_edus.items():
            for key, edu in edu_map.items():
                self.federation_sender.send_edu(
                    edu.destination, edu.edu_type, edu.content, key=key,
                )

        for destination, edu_list in edus.items():
            for edu in edu_list:
                self.federation_sender.send_edu(
                    edu.destination, edu.edu_type, edu.content, key=None,
                )

        for destination, failure_list in failures.items():
            for failure in failure_list:
                self.federation_sender.send_failure(destination, failure)

        for destination in device_destinations:
            self.federation_sender.send_device_messages(destination)

        # Record where we are in the stream.
        yield self.store.update_federation_out_pos("federation", latest_id)

    # We also need to poke the federation sender when new events happen
    event_stream = result.get("events")
    if event_stream:
        latest_pos = event_stream["position"]
        self.federation_sender.notify_new_events(latest_pos)
def _handle_incoming_transaction(self, origin, transaction, request_time):
    """ Process an incoming transaction and return the HTTP response

    Args:
        origin (unicode): the server making the request
        transaction (Transaction): incoming transaction
        request_time (int): timestamp that the HTTP request arrived at

    Returns:
        Deferred[(int, object)]: http response code and body
    """
    # Transactions are idempotent: if we have already handled this one,
    # replay the stored response rather than re-processing.
    response = yield self.transaction_actions.have_responded(
        origin, transaction)

    if response:
        logger.debug(
            "[%s] We've already responded to this request",
            transaction.transaction_id,
        )
        defer.returnValue(response)
        return

    logger.debug("[%s] Transaction is new", transaction.transaction_id)

    # Reject if PDU count > 50 or EDU count > 100
    if len(transaction.pdus) > 50 or (hasattr(transaction, "edus")
                                      and len(transaction.edus) > 100):
        logger.info("Transaction PDU or EDU count too large. Returning 400")

        response = {}
        yield self.transaction_actions.set_response(
            origin, transaction, 400, response)
        defer.returnValue((400, response))

    received_pdus_counter.inc(len(transaction.pdus))

    origin_host, _ = parse_server_name(origin)

    # Group the PDUs by room so rooms can be processed in parallel below.
    pdus_by_room = {}

    for p in transaction.pdus:
        # Rewrite the relative "age" field (ms the event spent queued on
        # the origin) into an absolute "age_ts" timestamp.
        if "unsigned" in p:
            unsigned = p["unsigned"]
            if "age" in unsigned:
                p["age"] = unsigned["age"]
        if "age" in p:
            p["age_ts"] = request_time - int(p["age"])
            del p["age"]

        # We try and pull out an event ID so that if later checks fail we
        # can log something sensible. We don't mandate an event ID here in
        # case future event formats get rid of the key.
        possible_event_id = p.get("event_id", "<Unknown>")

        # Now we get the room ID so that we can check that we know the
        # version of the room.
        room_id = p.get("room_id")
        if not room_id:
            logger.info(
                "Ignoring PDU as does not have a room_id. Event ID: %s",
                possible_event_id,
            )
            continue

        try:
            room_version = yield self.store.get_room_version(room_id)
        except NotFoundError:
            logger.info("Ignoring PDU for unknown room_id: %s", room_id)
            continue

        try:
            format_ver = room_version_to_event_format(room_version)
        except UnsupportedRoomVersionError:
            # this can happen if support for a given room version is withdrawn,
            # so that we still get events for said room.
            logger.info(
                "Ignoring PDU for room %s with unknown version %s",
                room_id,
                room_version,
            )
            continue

        event = event_from_pdu_json(p, format_ver)
        pdus_by_room.setdefault(room_id, []).append(event)

    # Map of event_id -> per-PDU result dict to return to the sender.
    pdu_results = {}

    # we can process different rooms in parallel (which is useful if they
    # require callouts to other servers to fetch missing events), but
    # impose a limit to avoid going too crazy with ram/cpu.

    @defer.inlineCallbacks
    def process_pdus_for_room(room_id):
        # Handles all PDUs for one room, recording per-event results.
        logger.debug("Processing PDUs for %s", room_id)
        try:
            yield self.check_server_matches_acl(origin_host, room_id)
        except AuthError as e:
            # The origin is banned by the room's server ACL: reject every
            # PDU for this room.
            logger.warn("Ignoring PDUs for room %s from banned server",
                        room_id)
            for pdu in pdus_by_room[room_id]:
                event_id = pdu.event_id
                pdu_results[event_id] = e.error_dict()
            return

        for pdu in pdus_by_room[room_id]:
            event_id = pdu.event_id
            with nested_logging_context(event_id):
                try:
                    yield self._handle_received_pdu(origin, pdu)
                    # Empty dict signals success for this event.
                    pdu_results[event_id] = {}
                except FederationError as e:
                    logger.warn("Error handling PDU %s: %s", event_id, e)
                    pdu_results[event_id] = {"error": str(e)}
                except Exception as e:
                    # A failure on one PDU must not abort the rest of the
                    # transaction; record the error and carry on.
                    f = failure.Failure()
                    pdu_results[event_id] = {"error": str(e)}
                    logger.error(
                        "Failed to handle PDU %s",
                        event_id,
                        exc_info=(f.type, f.value, f.getTracebackObject()),
                    )

    yield concurrently_execute(process_pdus_for_room, pdus_by_room.keys(),
                               TRANSACTION_CONCURRENCY_LIMIT)

    # EDUs are processed sequentially, after all PDUs.
    if hasattr(transaction, "edus"):
        for edu in (Edu(**x) for x in transaction.edus):
            yield self.received_edu(origin, edu.edu_type, edu.content)

    response = {"pdus": pdu_results}

    logger.debug("Returning: %s", str(response))

    # Persist the response so a retransmitted transaction gets the same
    # answer (see the dedup check at the top).
    yield self.transaction_actions.set_response(origin, transaction, 200,
                                                response)
    defer.returnValue((200, response))