def noop_event_update(event, data):
    # Check whether the update is actually updating fields.
    # We do this by cloning the event, updating the fields and
    # comparing them. This is less cumbersome than having to think
    # about the multiple values of the `when` field.
    e = Event()
    e.update(event)
    e.namespace = event.namespace

    for attr in Event.API_MODIFIABLE_FIELDS:
        if attr in data:
            setattr(e, attr, data[attr])

    e1 = encode(event)
    e2 = encode(e)

    for attr in Event.API_MODIFIABLE_FIELDS:
        # We have to handle participants a bit differently because
        # it's a list which can be permuted.
        if attr == 'participants':
            continue

        event_value = e1.get(attr)
        e_value = e2.get(attr)
        if event_value != e_value:
            return False

    e_participants = {p['email']: p for p in e.participants}
    event_participants = {p['email']: p for p in event.participants}
    if len(e_participants.keys()) != len(event_participants.keys()):
        return False

    for email in e_participants:
        if email not in event_participants:
            return False

        p1 = e_participants[email]
        p2 = event_participants[email]

        p1_status = p1.get('status')
        p2_status = p2.get('status')
        if p1_status != p2_status:
            return False

        p1_comment = p1.get('comment')
        p2_comment = p2.get('comment')
        if p1_comment != p2_comment:
            return False

    return True
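# Hedged usage sketch (not from the original source): illustrates what
# noop_event_update computes -- whether an API update payload would actually
# change the event. `db_session` and `event_id` are assumed to exist, and
# 'title' is assumed to be in Event.API_MODIFIABLE_FIELDS.
event = db_session.query(Event).get(event_id)
assert noop_event_update(event, {'title': event.title})        # no-op
assert not noop_event_update(event, {'title': 'A new title'})  # real change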
def index_messages(namespace_id, namespace_public_id, created_before=None):
    """ Index the messages of a namespace. """
    if created_before is not None:
        created_before = dateutil.parser.parse(created_before)

    indexed_count = 0
    search_engine = NamespaceSearchEngine(namespace_public_id)

    with session_scope() as db_session:
        query = db_session.query(Message).filter(
            Message.namespace_id == namespace_id)

        if created_before is not None:
            query = query.filter(Message.created_at <= created_before)

        query = query.options(joinedload(Message.parts).
                              load_only('content_disposition'))

        encoded = []
        for obj in safer_yield_per(query, Message.id, 0, CHUNK_SIZE):
            encoded_obj = encode(obj, namespace_public_id=namespace_public_id)
            index_obj = _process_attributes(encoded_obj)
            encoded.append(('index', index_obj))

        log.info('Going to index messages', namespace_id=namespace_id,
                 namespace_public_id=namespace_public_id)

        indexed_count += search_engine.messages.bulk_index(encoded)

    log.info('Indexed messages', namespace_id=namespace_id,
             namespace_public_id=namespace_public_id,
             message_count=indexed_count)

    return indexed_count
def send_draft_copy(account, draft, custom_body, recipient):
    """
    Sends a copy of this draft to the recipient, using the specified body
    rather than the one on the draft object, and not marking the draft as
    sent. Used within multi-send to send messages to individual recipients
    with customized bodies.

    """
    # Create the response to send on success by serializing the draft. After
    # serializing, we replace the existing body (which we still need to
    # retain in the draft for when it's saved to the sent folder) with the
    # new custom body (which the recipient will get and which should be
    # returned in this response).
    response_on_success = encode(draft)
    response_on_success["body"] = custom_body
    response_on_success = APIEncoder().jsonify(response_on_success)

    # Now send the draft to the specified recipient. The send_custom method
    # will write the custom body into the message in place of the one in the
    # draft.
    try:
        sendmail_client = get_sendmail_client(account)
        sendmail_client.send_custom(draft, custom_body, [recipient])
    except SendMailException as exc:
        kwargs = {}
        if exc.failures:
            kwargs["failures"] = exc.failures
        if exc.server_error:
            kwargs["server_error"] = exc.server_error
        return err(exc.http_code, exc.message, **kwargs)

    return response_on_success
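# Hedged sketch (an assumption, not the actual endpoint code): how the
# multi-send flow described in the docstring might drive send_draft_copy,
# sending one customized copy per recipient while the draft body itself is
# left untouched. `account`, `draft`, and `personalized_bodies` (a mapping of
# recipient -> body) are hypothetical.
for recipient, body in personalized_bodies.items():
    response = send_draft_copy(account, draft, body, recipient)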
def index_messages(namespace, updated_since=None):
    """ Index the messages of a namespace. """
    namespace_id, namespace_public_id = namespace

    if updated_since is not None:
        updated_since = dateutil.parser.parse(updated_since)

    indexed_count = 0
    search_engine = NamespaceSearchEngine(namespace_public_id)

    with session_scope() as db_session:
        query = db_session.query(Message).filter(
            Message.namespace_id == namespace_id)

        if updated_since is not None:
            query = query.filter(Message.updated_at > updated_since)

        query = query.options(joinedload(Message.parts).
                              load_only('content_disposition'))

        encoded = []
        for obj in safer_yield_per(query, Message.id, 0, CHUNK_SIZE):
            encoded_obj = encode(obj, namespace_public_id=namespace_public_id)
            encoded.append(encoded_obj)

        indexed_count += search_engine.messages.bulk_index(encoded)

    log.info('Indexed messages', namespace_id=namespace_id,
             namespace_public_id=namespace_public_id,
             message_count=indexed_count)

    return indexed_count
def send_draft_copy(account, draft, custom_body, recipient):
    """
    Sends a copy of this draft to the recipient, using the specified body
    rather than the one on the draft object, and not marking the draft as
    sent. Used within multi-send to send messages to individual recipients
    with customized bodies.

    """
    # Create the response to send on success by serializing the draft. After
    # serializing, we replace the existing body (which we still need to
    # retain in the draft for when it's saved to the sent folder) with the
    # new custom body (which the recipient will get and which should be
    # returned in this response).
    response_on_success = encode(draft)
    response_on_success['body'] = custom_body
    response_on_success = APIEncoder().jsonify(response_on_success)

    # Now send the draft to the specified recipient. The send_custom method
    # will write the custom body into the message in place of the one in the
    # draft.
    try:
        sendmail_client = get_sendmail_client(account)
        sendmail_client.send_custom(draft, custom_body, [recipient])
    except SendMailException as exc:
        kwargs = {}
        if exc.failures:
            kwargs['failures'] = exc.failures
        if exc.server_error:
            kwargs['server_error'] = exc.server_error
        return err(exc.http_code, exc.message, **kwargs)

    return response_on_success
def index_messages(namespace, updated_since=None):
    """ Index the messages of a namespace. """
    namespace_id, namespace_public_id = namespace

    if updated_since is not None:
        updated_since = dateutil.parser.parse(updated_since)

    indexed_count = 0
    search_engine = NamespaceSearchEngine(namespace_public_id)

    with session_scope() as db_session:
        query = db_session.query(Message).filter(
            Message.namespace_id == namespace_id)

        if updated_since is not None:
            query = query.filter(Message.updated_at > updated_since)

        query = query.options(
            joinedload(Message.parts).load_only('content_disposition'))

        encoded = []
        for obj in safer_yield_per(query, Message.id, 0, CHUNK_SIZE):
            encoded_obj = encode(obj, namespace_public_id=namespace_public_id)
            encoded.append(('index', encoded_obj))

        indexed_count += search_engine.messages.bulk_index(encoded)

    log.info('Indexed messages', namespace_id=namespace_id,
             namespace_public_id=namespace_public_id,
             message_count=indexed_count)

    return indexed_count
def create_revision(obj, session, revision_type):
    from inbox.api.kellogs import encode
    assert revision_type in ('insert', 'update', 'delete')
    if (not isinstance(obj, HasRevisions) or
            obj.should_suppress_transaction_creation):
        return
    if revision_type == 'update' and not obj.has_versioned_changes():
        return
    revision = Transaction(command=revision_type, record_id=obj.id,
                           object_type=obj.API_OBJECT_NAME,
                           object_public_id=obj.public_id,
                           namespace_id=obj.namespace.id)
    if revision_type != 'delete':
        revision.snapshot = encode(obj)
    session.add(revision)
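# Illustrative sketch only: one plausible way create_revision could be wired
# to a SQLAlchemy session so that every flush records revisions. The actual
# hookup in the codebase may differ; listening on the Session class applies
# the hook to all sessions.
from sqlalchemy import event
from sqlalchemy.orm import Session


@event.listens_for(Session, 'after_flush')
def _record_revisions(session, flush_context):
    for obj in session.new:
        create_revision(obj, session, 'insert')
    for obj in session.dirty:
        # create_revision itself skips objects without versioned changes.
        create_revision(obj, session, 'update')
    for obj in session.deleted:
        create_revision(obj, session, 'delete')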
def take_snapshot(self, obj):
    """Record the API's representation of `obj` at the time this
    transaction is generated, as well as any other properties we want to
    have available in the transaction log. Used for delta syncing and
    the ping API."""
    from inbox.api.kellogs import encode
    self.public_snapshot = encode(obj)

    from inbox.models.message import Message
    if isinstance(obj, Message):  # hack
        self.private_snapshot = {
            'recentdate': obj.thread.recentdate,
            'subjectdate': obj.thread.subjectdate,
            'filenames': [part.block.filename for part in obj.parts
                          if part.is_attachment]}
def index_threads(namespace_id, namespace_public_id, created_before=None):
    """ Index the threads of a namespace. """
    if created_before is not None:
        created_before = dateutil.parser.parse(created_before)

    indexed_count = 0
    search_engine = NamespaceSearchEngine(namespace_public_id,
                                          create_index=True)

    with session_scope() as db_session:
        query = db_session.query(Thread).filter(
            Thread.namespace_id == namespace_id)

        if created_before is not None:
            query = query.filter(Thread.created_at <= created_before)

        query = query.options(
            subqueryload(Thread.messages).load_only(
                'public_id', 'is_draft', 'from_addr', 'to_addr',
                'cc_addr', 'bcc_addr'),
            subqueryload('tagitems').joinedload('tag').load_only(
                'public_id', 'name'))

        encoded = []
        for obj in safer_yield_per(query, Thread.id, 0, CHUNK_SIZE):
            if len(encoded) >= INDEX_CHUNK_SIZE:
                indexed_count += search_engine.threads.bulk_index(encoded)
                encoded = []

            index_obj = encode(obj, namespace_public_id=namespace_public_id)
            encoded.append(('index', index_obj))

        if encoded:
            indexed_count += search_engine.threads.bulk_index(encoded)

    log.info('Indexed threads', namespace_id=namespace_id,
             namespace_public_id=namespace_public_id,
             thread_count=indexed_count)

    return indexed_count
def index_threads(namespace_id, namespace_public_id, created_before=None):
    """ Index the threads of a namespace. """
    if created_before is not None:
        created_before = dateutil.parser.parse(created_before)

    indexed_count = 0
    search_engine = NamespaceSearchEngine(namespace_public_id,
                                          create_index=True)

    with session_scope() as db_session:
        query = db_session.query(Thread).filter(
            Thread.namespace_id == namespace_id)

        if created_before is not None:
            query = query.filter(Thread.created_at <= created_before)

        query = query.options(
            subqueryload(Thread.messages).
            load_only('public_id', 'is_draft', 'from_addr', 'to_addr',
                      'cc_addr', 'bcc_addr'),
            subqueryload('tagitems').joinedload('tag').
            load_only('public_id', 'name'))

        encoded = []
        for obj in safer_yield_per(query, Thread.id, 0, CHUNK_SIZE):
            if len(encoded) >= INDEX_CHUNK_SIZE:
                indexed_count += search_engine.threads.bulk_index(encoded)
                encoded = []

            index_obj = encode(obj, namespace_public_id=namespace_public_id)
            encoded.append(('index', index_obj))

        if encoded:
            indexed_count += search_engine.threads.bulk_index(encoded)

    log.info('Indexed threads', namespace_id=namespace_id,
             namespace_public_id=namespace_public_id,
             thread_count=indexed_count)

    return indexed_count
def index(self, transactions, db_session):
    """
    Translate database operations to Elasticsearch index operations
    and perform them.

    """
    namespace_map = defaultdict(lambda: defaultdict(list))

    for trx in transactions:
        namespace_id = trx.namespace.public_id
        type_ = trx.object_type
        if trx.command == 'delete':
            operation = 'delete'
            api_repr = {'id': trx.object_public_id}
        else:
            operation = 'index'
            object_cls = transaction_objects()[trx.object_type]
            obj = db_session.query(object_cls).get(trx.record_id)
            if obj is None:
                continue
            api_repr = encode(obj, namespace_public_id=namespace_id)

        namespace_map[namespace_id][type_].append((operation, api_repr))

    self.log.info('namespaces to index count', count=len(namespace_map))

    for namespace_id in namespace_map:
        engine = NamespaceSearchEngine(namespace_id, create_index=True)

        messages = namespace_map[namespace_id]['message']
        message_count = engine.messages.bulk_index(messages) if messages \
            else 0

        threads = namespace_map[namespace_id]['thread']
        thread_count = engine.threads.bulk_index(threads) if threads \
            else 0

        self.log.info('per-namespace index counts',
                      namespace_id=namespace_id,
                      message_count=message_count,
                      thread_count=thread_count)
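# Hedged usage sketch: feed a batch of transaction-log rows to the indexer.
# `indexer` is assumed to be an instance of the class defining index() above,
# and `pointer` a previously saved transaction id.
with session_scope() as db_session:
    transactions = db_session.query(Transaction).filter(
        Transaction.id > pointer).order_by(Transaction.id).limit(100).all()
    indexer.index(transactions, db_session)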
def index_messages(namespace_id, namespace_public_id, created_before=None):
    """ Index the messages of a namespace. """
    if created_before is not None:
        created_before = dateutil.parser.parse(created_before)

    indexed_count = 0
    search_engine = NamespaceSearchEngine(namespace_public_id,
                                          create_index=True)

    with session_scope() as db_session:
        query = db_session.query(Message).filter(
            Message.namespace_id == namespace_id)

        if created_before is not None:
            query = query.filter(Message.created_at <= created_before)

        query = query.options(
            joinedload(Message.parts).load_only('content_disposition'))

        encoded = []
        for obj in safer_yield_per(query, Message.id, 0, CHUNK_SIZE):
            if len(encoded) >= INDEX_CHUNK_SIZE:
                indexed_count += search_engine.messages.bulk_index(encoded)
                encoded = []

            index_obj = encode(obj, namespace_public_id=namespace_public_id)
            encoded.append(('index', index_obj))

        if encoded:
            indexed_count += search_engine.messages.bulk_index(encoded)

    log.info('Indexed messages', namespace_id=namespace_id,
             namespace_public_id=namespace_public_id,
             message_count=indexed_count)

    return indexed_count
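# Hypothetical invocation of the chunked indexer above. created_before is
# parsed with dateutil, so any ISO-8601 timestamp string should work; the
# id values here are made up for illustration.
count = index_messages(namespace_id=42,
                       namespace_public_id='aaabbbcccdddeee',
                       created_before='2015-01-01T00:00:00')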
def index_namespace(namespace_public_id, updated_since=None):
    """
    Create an Elasticsearch index for a namespace and index its threads and
    messages.

    """
    if updated_since is not None:
        updated_since = dateutil.parser.parse(updated_since)

    indexed_count = 0
    for obj_type in (Message, Thread):
        with session_scope() as db_session:
            namespace = db_session.query(Namespace).filter(
                Namespace.public_id == namespace_public_id).one()

            search_engine = NamespaceSearchEngine(namespace_public_id)

            # TODO: paginate the query so that we don't run out of memory on
            # life-sized accounts.
            objects = db_session.query(obj_type).filter(
                obj_type.namespace_id == namespace.id)
            if updated_since is not None:
                objects = objects.filter(obj_type.updated_at > updated_since)

            for obj in objects.all():
                encoded_obj = encode(
                    obj, namespace_public_id=namespace_public_id,
                    format_address_fn=es_format_address_list,
                    format_tags_fn=es_format_tags_list)

                if obj_type == Message:
                    search_engine.messages.index(encoded_obj)
                elif obj_type == Thread:
                    search_engine.threads.index(encoded_obj)

                indexed_count += 1

    return indexed_count
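# Sketch of one way to address the pagination TODO in index_namespace,
# reusing the safer_yield_per helper seen in the other indexing functions
# here (an assumption, not the actual fix): stream objects in CHUNK_SIZE
# batches instead of loading everything with .all().
for obj in safer_yield_per(objects, obj_type.id, 0, CHUNK_SIZE):
    encoded_obj = encode(obj, namespace_public_id=namespace_public_id,
                         format_address_fn=es_format_address_list,
                         format_tags_fn=es_format_tags_list)
    if obj_type == Message:
        search_engine.messages.index(encoded_obj)
    elif obj_type == Thread:
        search_engine.threads.index(encoded_obj)
    indexed_count += 1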
def format_transactions_after_pointer(namespace, pointer, db_session,
                                      result_limit, exclude_types=None,
                                      include_types=None,
                                      exclude_folders=True,
                                      expand=False):
    """
    Return a pair (deltas, new_pointer), where deltas is a list of change
    events, represented as dictionaries:
    {
      "object": <API object type, e.g. "thread">,
      "event": <"create", "modify", or "delete">,
      "attributes": <API representation of the object for insert/update
                     events>,
      "cursor": <public_id of the transaction>
    }

    and new_pointer is the integer id of the last included transaction

    Arguments
    ---------
    namespace: Namespace
        Namespace for which to get changes.
    pointer: int
        Process transactions starting after this id.
    db_session: new_session
        database session
    result_limit: int
        Maximum number of results to return. (Because we may roll up multiple
        changes to the same object, fewer results can be returned.)
    exclude_types: list, optional
        If given, don't include transactions for these types of objects.

    """
    exclude_types = set(exclude_types) if exclude_types else set()
    # Begin backwards-compatibility shim -- suppress new object types for
    # now, because clients may not be able to deal with them.
    exclude_types.add('account')
    if exclude_folders is True:
        exclude_types.update(('folder', 'label'))
    # End backwards-compatibility shim.

    last_trx = _get_last_trx_id_for_namespace(namespace.id, db_session)
    if last_trx == pointer:
        return ([], pointer)

    while True:
        # deleted_at condition included to allow this query to be satisfied
        # via the legacy index on (namespace_id, deleted_at) for performance.
        # Also need to explicitly specify the index hint because the query
        # planner is dumb as nails and otherwise would make this super slow
        # for some values of namespace_id and pointer.
        # TODO(emfree): Remove this hack and ensure that the right index (on
        # namespace_id only) exists.
        transactions = db_session.query(Transaction). \
            filter(
                Transaction.id > pointer,
                Transaction.namespace_id == namespace.id,
                Transaction.deleted_at.is_(None)). \
            with_hint(Transaction, 'USE INDEX (namespace_id_deleted_at)')

        if exclude_types is not None:
            transactions = transactions.filter(
                ~Transaction.object_type.in_(exclude_types))

        if include_types is not None:
            transactions = transactions.filter(
                Transaction.object_type.in_(include_types))

        transactions = transactions. \
            order_by(asc(Transaction.id)).limit(result_limit).all()

        if not transactions:
            return ([], pointer)

        results = []

        # Group deltas by object type.
        trxs_by_obj_type = collections.defaultdict(list)
        for trx in transactions:
            trxs_by_obj_type[trx.object_type].append(trx)

        for obj_type, trxs in trxs_by_obj_type.items():
            # Build a dictionary mapping pairs (record_id, command) to
            # transaction. If successive modifies for a given record id
            # appear in the list of transactions, this will only keep the
            # latest one (which is what we want).
            latest_trxs = {(trx.record_id, trx.command): trx for trx in
                           sorted(trxs, key=lambda t: t.id)}.values()
            # Load all referenced not-deleted objects.
            ids_to_query = [trx.record_id for trx in latest_trxs
                            if trx.command != 'delete']

            object_cls = transaction_objects()[obj_type]
            query = db_session.query(object_cls).filter(
                object_cls.id.in_(ids_to_query),
                object_cls.namespace_id == namespace.id)

            if object_cls == Thread:
                query = query.options(*Thread.api_loading_options(expand))
            elif object_cls == Message:
                query = query.options(*Message.api_loading_options(expand))

            objects = {obj.id: obj for obj in query}

            for trx in latest_trxs:
                delta = {
                    'object': trx.object_type,
                    'event': EVENT_NAME_FOR_COMMAND[trx.command],
                    'id': trx.object_public_id,
                    'cursor': trx.public_id
                }
                if trx.command != 'delete':
                    obj = objects.get(trx.record_id)
                    if obj is None:
                        continue
                    repr_ = encode(obj,
                                   namespace_public_id=namespace.public_id,
                                   expand=expand)
                    delta['attributes'] = repr_

                results.append((trx.id, delta))

        if results:
            # Sort deltas by id of the underlying transactions.
            results.sort()
            deltas = [d for _, d in results]
            return (deltas, results[-1][0])
        else:
            # It's possible that none of the referenced objects exist any
            # more, meaning the result list is empty. In that case, keep
            # traversing the log until we get actual results or reach the
            # end.
            pointer = transactions[-1].id
def default(self, data):
    serialized = encode(data)
    if serialized is not None:
        return serialized
    raise TypeError
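# Context sketch (an assumption, not the actual class from the codebase):
# default() above matches json.JSONEncoder's extension hook, so an encoder
# class built around it can be passed straight to json.dumps. Anything
# encode() (from inbox.api.kellogs) understands gets serialized; everything
# else raises TypeError, as the json module expects.
import json


class _Encoder(json.JSONEncoder):
    def default(self, data):
        serialized = encode(data)
        if serialized is not None:
            return serialized
        raise TypeError


payload = json.dumps(some_api_object, cls=_Encoder)  # hypothetical object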
def format_transactions_after_pointer(namespace, pointer, db_session,
                                      result_limit, exclude_types=None,
                                      include_types=None,
                                      exclude_folders=True,
                                      exclude_metadata=True,
                                      exclude_account=True,
                                      expand=False):
    """
    Return a pair (deltas, new_pointer), where deltas is a list of change
    events, represented as dictionaries:
    {
      "object": <API object type, e.g. "thread">,
      "event": <"create", "modify", or "delete">,
      "attributes": <API representation of the object for insert/update
                     events>,
      "cursor": <public_id of the transaction>
    }

    and new_pointer is the integer id of the last included transaction

    Arguments
    ---------
    namespace: Namespace
        Namespace for which to get changes.
    pointer: int
        Process transactions starting after this id.
    db_session: new_session
        database session
    result_limit: int
        Maximum number of results to return. (Because we may roll up multiple
        changes to the same object, fewer results can be returned.)
    exclude_types: list, optional
        If given, don't include transactions for these types of objects.

    """
    exclude_types = set(exclude_types) if exclude_types else set()
    # Begin backwards-compatibility shim -- suppress new object types for
    # now, because clients may not be able to deal with them.
    if exclude_folders is True:
        exclude_types.update(('folder', 'label'))

    if exclude_account is True:
        exclude_types.add('account')
    # End backwards-compatibility shim.

    # Metadata is excluded by default, and can only be included by setting
    # the exclude_metadata flag to False. If listed in include_types, remove
    # it.
    if exclude_metadata is True:
        exclude_types.add('metadata')
    if include_types is not None and 'metadata' in include_types:
        include_types.remove('metadata')

    last_trx = _get_last_trx_id_for_namespace(namespace.id, db_session)
    if last_trx == pointer:
        return ([], pointer)

    while True:
        # deleted_at condition included to allow this query to be satisfied
        # via the legacy index on (namespace_id, deleted_at) for performance.
        # Also need to explicitly specify the index hint because the query
        # planner is dumb as nails and otherwise would make this super slow
        # for some values of namespace_id and pointer.
        # TODO(emfree): Remove this hack and ensure that the right index (on
        # namespace_id only) exists.
        transactions = db_session.query(Transaction). \
            filter(
                Transaction.id > pointer,
                Transaction.namespace_id == namespace.id,
                Transaction.deleted_at.is_(None)). \
            with_hint(Transaction, 'USE INDEX (namespace_id_deleted_at)')

        if exclude_types is not None:
            transactions = transactions.filter(
                ~Transaction.object_type.in_(exclude_types))

        if include_types is not None:
            transactions = transactions.filter(
                Transaction.object_type.in_(include_types))

        transactions = transactions. \
            order_by(asc(Transaction.id)).limit(result_limit).all()

        if not transactions:
            return ([], pointer)

        results = []

        # Group deltas by object type.
        trxs_by_obj_type = collections.defaultdict(list)
        for trx in transactions:
            trxs_by_obj_type[trx.object_type].append(trx)

        for obj_type, trxs in trxs_by_obj_type.items():
            # Build a dictionary mapping pairs (record_id, command) to
            # transaction. If successive modifies for a given record id
            # appear in the list of transactions, this will only keep the
            # latest one (which is what we want).
            latest_trxs = {(trx.record_id, trx.command): trx for trx in
                           sorted(trxs, key=lambda t: t.id)}.values()
            # Load all referenced not-deleted objects.
            ids_to_query = [trx.record_id for trx in latest_trxs
                            if trx.command != 'delete']

            object_cls = transaction_objects()[obj_type]

            if object_cls == Account:
                # The base query for Account queries the /Namespace/ table
                # since the API-returned "`account`" is a `namespace`
                # under-the-hood.
                query = db_session.query(Namespace).join(Account).filter(
                    Account.id.in_(ids_to_query),
                    Namespace.id == namespace.id)

                # Key by /namespace.account_id/ --
                # namespace.id may not be equal to account.id
                # and trx.record_id == account.id for `account` trxs.
                objects = {obj.account_id: obj for obj in query}
            else:
                query = db_session.query(object_cls).filter(
                    object_cls.id.in_(ids_to_query),
                    object_cls.namespace_id == namespace.id)

                if object_cls == Thread:
                    query = query.options(
                        *Thread.api_loading_options(expand))
                elif object_cls == Message:
                    query = query.options(
                        *Message.api_loading_options(expand))

                objects = {obj.id: obj for obj in query}

            for trx in latest_trxs:
                delta = {
                    'object': trx.object_type,
                    'event': EVENT_NAME_FOR_COMMAND[trx.command],
                    'id': trx.object_public_id,
                    'cursor': trx.public_id
                }
                if trx.command != 'delete':
                    obj = objects.get(trx.record_id)
                    if obj is None:
                        continue

                    repr_ = encode(
                        obj, namespace_public_id=namespace.public_id,
                        expand=expand)
                    delta['attributes'] = repr_

                results.append((trx.id, delta))

        if results:
            # Sort deltas by id of the underlying transactions.
            results.sort()
            deltas = [d for _, d in results]
            return (deltas, results[-1][0])
        else:
            # It's possible that none of the referenced objects exist any
            # more, meaning the result list is empty. In that case, keep
            # traversing the log until we get actual results or reach the
            # end.
            pointer = transactions[-1].id
def format_transactions_after_pointer(namespace, pointer, db_session,
                                      result_limit, exclude_types=None,
                                      include_types=None,
                                      exclude_folders=True,
                                      exclude_metadata=True,
                                      exclude_account=True,
                                      expand=False, is_n1=False):
    """
    Return a pair (deltas, new_pointer), where deltas is a list of change
    events, represented as dictionaries:
    {
      "object": <API object type, e.g. "thread">,
      "event": <"create", "modify", or "delete">,
      "attributes": <API representation of the object for insert/update
                     events>,
      "cursor": <public_id of the transaction>
    }

    and new_pointer is the integer id of the last included transaction

    Arguments
    ---------
    namespace: Namespace
        Namespace for which to get changes.
    pointer: int
        Process transactions starting after this id.
    db_session: new_session
        database session
    result_limit: int
        Maximum number of results to return. (Because we may roll up multiple
        changes to the same object, fewer results can be returned.)
    exclude_types: list, optional
        If given, don't include transactions for these types of objects.

    """
    exclude_types = set(exclude_types) if exclude_types else set()
    # Begin backwards-compatibility shim -- suppress new object types for
    # now, because clients may not be able to deal with them.
    if exclude_folders is True:
        exclude_types.update(('folder', 'label'))

    if exclude_account is True:
        exclude_types.add('account')
    # End backwards-compatibility shim.

    # Metadata is excluded by default, and can only be included by setting
    # the exclude_metadata flag to False. If listed in include_types, remove
    # it.
    if exclude_metadata is True:
        exclude_types.add('metadata')
    if include_types is not None and 'metadata' in include_types:
        include_types.remove('metadata')

    last_trx = _get_last_trx_id_for_namespace(namespace.id, db_session)
    if last_trx == pointer:
        return ([], pointer)

    while True:
        transactions = db_session.query(Transaction). \
            filter(
                Transaction.id > pointer,
                Transaction.namespace_id == namespace.id)

        if exclude_types is not None:
            transactions = transactions.filter(
                ~Transaction.object_type.in_(exclude_types))

        if include_types is not None:
            transactions = transactions.filter(
                Transaction.object_type.in_(include_types))

        transactions = transactions. \
            order_by(asc(Transaction.id)).limit(result_limit).all()

        if not transactions:
            return ([], pointer)

        results = []

        # Group deltas by object type.
        trxs_by_obj_type = collections.defaultdict(list)
        for trx in transactions:
            trxs_by_obj_type[trx.object_type].append(trx)

        for obj_type, trxs in trxs_by_obj_type.items():
            # Build a dictionary mapping pairs (record_id, command) to
            # transaction. If successive modifies for a given record id
            # appear in the list of transactions, this will only keep the
            # latest one (which is what we want).
            latest_trxs = {(trx.record_id, trx.command): trx for trx in
                           sorted(trxs, key=lambda t: t.id)}.values()
            # Load all referenced not-deleted objects.
            ids_to_query = [trx.record_id for trx in latest_trxs
                            if trx.command != 'delete']

            object_cls = transaction_objects()[obj_type]

            if object_cls == Account:
                # The base query for Account queries the /Namespace/ table
                # since the API-returned "`account`" is a `namespace`
                # under-the-hood.
                query = db_session.query(Namespace).join(Account).filter(
                    Account.id.in_(ids_to_query),
                    Namespace.id == namespace.id)

                # Key by /namespace.account_id/ --
                # namespace.id may not be equal to account.id
                # and trx.record_id == account.id for `account` trxs.
                objects = {obj.account_id: obj for obj in query}
            else:
                query = db_session.query(object_cls).filter(
                    object_cls.id.in_(ids_to_query),
                    object_cls.namespace_id == namespace.id)

                if object_cls == Thread:
                    query = query.options(
                        *Thread.api_loading_options(expand))
                elif object_cls == Message:
                    query = query.options(
                        *Message.api_loading_options(expand))

                objects = {obj.id: obj for obj in query}

            for trx in latest_trxs:
                delta = {
                    'object': trx.object_type,
                    'event': EVENT_NAME_FOR_COMMAND[trx.command],
                    'id': trx.object_public_id,
                    'cursor': trx.public_id
                }
                if trx.command != 'delete':
                    obj = objects.get(trx.record_id)
                    if obj is None:
                        continue

                    repr_ = encode(
                        obj, namespace_public_id=namespace.public_id,
                        expand=expand, is_n1=is_n1)
                    delta['attributes'] = repr_

                results.append((trx.id, delta))

        if results:
            # Sort deltas by id of the underlying transactions.
            results.sort()
            deltas = [d for _, d in results]
            return (deltas, results[-1][0])
        else:
            # It's possible that none of the referenced objects exist any
            # more, meaning the result list is empty. In that case, keep
            # traversing the log until we get actual results or reach the
            # end.
            pointer = transactions[-1].id
def format_transactions_after_pointer(namespace, pointer, db_session,
                                      result_limit, exclude_types=None,
                                      include_types=None):
    """
    Return a pair (deltas, new_pointer), where deltas is a list of change
    events, represented as dictionaries:
    {
      "object": <API object type, e.g. "thread">,
      "event": <"create", "modify", or "delete">,
      "attributes": <API representation of the object for insert/update
                     events>,
      "cursor": <public_id of the transaction>
    }

    and new_pointer is the integer id of the last included transaction

    Arguments
    ---------
    namespace: Namespace
        Namespace for which to get changes.
    pointer: int
        Process transactions starting after this id.
    db_session: InboxSession
        database session
    result_limit: int
        Maximum number of results to return. (Because we may roll up multiple
        changes to the same object, fewer results can be returned.)
    exclude_types: list, optional
        If given, don't include transactions for these types of objects.

    """
    while True:
        # deleted_at condition included to allow this query to be satisfied
        # via the legacy index on (namespace_id, deleted_at) for performance.
        # Also need to explicitly specify the index hint because the query
        # planner is dumb as nails and otherwise would make this super slow
        # for some values of namespace_id and pointer.
        # TODO(emfree): Remove this hack and ensure that the right index (on
        # namespace_id only) exists.
        transactions = db_session.query(Transaction). \
            filter(
                Transaction.id > pointer,
                Transaction.namespace_id == namespace.id,
                Transaction.deleted_at.is_(None)). \
            with_hint(Transaction, 'USE INDEX (namespace_id_deleted_at)')

        if exclude_types is not None:
            transactions = transactions.filter(
                ~Transaction.object_type.in_(exclude_types))

        if include_types is not None:
            transactions = transactions.filter(
                Transaction.object_type.in_(include_types))

        transactions = transactions. \
            order_by(asc(Transaction.id)).limit(result_limit).all()

        if not transactions:
            return ([], pointer)

        results = []

        # Group deltas by object type.
        trxs_by_obj_type = collections.defaultdict(list)
        for trx in transactions:
            trxs_by_obj_type[trx.object_type].append(trx)

        for obj_type, trxs in trxs_by_obj_type.items():
            # Build a dictionary mapping pairs (record_id, command) to
            # transaction. If successive modifies for a given record id
            # appear in the list of transactions, this will only keep the
            # latest one (which is what we want).
            latest_trxs = {(trx.record_id, trx.command): trx for trx in
                           sorted(trxs, key=lambda t: t.id)}.values()
            # Load all referenced not-deleted objects.
            ids_to_query = [trx.record_id for trx in latest_trxs
                            if trx.command != 'delete']

            object_cls = transaction_objects()[obj_type]
            query = db_session.query(object_cls).filter(
                object_cls.id.in_(ids_to_query),
                object_cls.namespace_id == namespace.id)

            if object_cls in QUERY_OPTIONS:
                query = query.options(*QUERY_OPTIONS[object_cls])

            objects = {obj.id: obj for obj in query}

            for trx in latest_trxs:
                delta = {
                    'object': trx.object_type,
                    'event': EVENT_NAME_FOR_COMMAND[trx.command],
                    'id': trx.object_public_id,
                    'cursor': trx.public_id
                }
                if trx.command != 'delete':
                    obj = objects.get(trx.record_id)
                    if obj is None:
                        continue
                    repr_ = encode(
                        obj, namespace_public_id=namespace.public_id)
                    delta['attributes'] = repr_

                results.append((trx.id, delta))

        if results:
            # Sort deltas by id of the underlying transactions.
            results.sort()
            deltas = [d for _, d in results]
            return (deltas, results[-1][0])
        else:
            # It's possible that none of the referenced objects exist any
            # more, meaning the result list is empty. In that case, keep
            # traversing the log until we get actual results or reach the
            # end.
            pointer = transactions[-1].id
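# Hedged consumer sketch for format_transactions_after_pointer: page through
# deltas until caught up. `namespace_public_id`, `saved_cursor`, and handle()
# are hypothetical; `pointer` is the integer id of the last-processed
# transaction.
with session_scope() as db_session:
    namespace = db_session.query(Namespace).filter(
        Namespace.public_id == namespace_public_id).one()
    pointer = saved_cursor
    while True:
        deltas, pointer = format_transactions_after_pointer(
            namespace, pointer, db_session, result_limit=100)
        if not deltas:
            break
        for delta in deltas:
            handle(delta)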