def index_messages(namespace_id, namespace_public_id, created_before=None):
    """ Index the messages of a namespace. """
    if created_before is not None:
        created_before = dateutil.parser.parse(created_before)

    indexed_count = 0
    search_engine = NamespaceSearchEngine(namespace_public_id)

    with session_scope() as db_session:
        query = db_session.query(Message).filter(
            Message.namespace_id == namespace_id)

        if created_before is not None:
            query = query.filter(Message.created_at <= created_before)

        query = query.options(joinedload(Message.parts).
                              load_only('content_disposition'))

        encoded = []
        for obj in safer_yield_per(query, Message.id, 0, CHUNK_SIZE):
            encoded_obj = encode(obj, namespace_public_id=namespace_public_id)
            index_obj = _process_attributes(encoded_obj)
            encoded.append(('index', index_obj))

        log.info('Going to index messages', namespace_id=namespace_id,
                 namespace_public_id=namespace_public_id)

        indexed_count += search_engine.messages.bulk_index(encoded)

    log.info('Indexed messages', namespace_id=namespace_id,
             namespace_public_id=namespace_public_id,
             message_count=indexed_count)

    return indexed_count


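Every snippet in this section pages through a large table with safer_yield_per(query, id_field, start_id, chunk_size). Its implementation is not reproduced here; as a rough sketch of the pattern the call sites imply (keyset pagination on the id column, fetching one bounded chunk at a time), it behaves roughly like the function below. This body is an assumption for illustration, not the actual inbox.sqlalchemy_ext.util code; it assumes each yielded row exposes an `id` attribute matching `id_field`.

# Sketch only -- the real safer_yield_per lives in inbox.sqlalchemy_ext.util
# and may differ in detail.
def safer_yield_per(query, id_field, start_id, count):
    """Yield rows from `query` in chunks of `count`, keyed on `id_field`,
    starting at `start_id`. Avoids materializing the whole result set the
    way a plain .all() (or a naive .yield_per()) would."""
    cur_id = start_id
    while True:
        results = query.filter(id_field >= cur_id). \
            order_by(asc(id_field)).limit(count).all()
        if not results:
            return
        for result in results:
            yield result
        # Assumes the yielded objects expose `.id` for the paging key.
        cur_id = results[-1].id + 1

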
def fetch_corresponding_thread(db_session, namespace_id, message):
    """Fetch a thread matching the given message. Returns None if
    there's no matching thread."""
    # FIXME: for performance reasons, we make the assumption that a reply
    # to a message always has a similar subject. This is only
    # right 95% of the time.
    clean_subject = cleanup_subject(message.subject)
    threads = db_session.query(Thread).filter(
        Thread.namespace_id == namespace_id,
        Thread._cleaned_subject == clean_subject). \
        order_by(desc(Thread.id))

    for thread in safer_yield_per(threads, Thread.id, 0, 100):
        for match in thread.messages:
            # A lot of people BCC some address when sending mass
            # emails so ignore BCC.
            match_bcc = match.bcc_addr if match.bcc_addr else []
            message_bcc = message.bcc_addr if message.bcc_addr else []

            match_emails = [t[1] for t in match.participants
                            if t not in match_bcc]
            message_emails = [t[1] for t in message.participants
                              if t not in message_bcc]

            # A conversation takes place between two or more persons.
            # Do the two messages share at least two participants? If so,
            # it's probably a related thread.
            match_participants_set = set(match_emails)
            message_participants_set = set(message_emails)
            if len(match_participants_set & message_participants_set) >= 2:
                # No need to loop through the rest of the messages
                # in the thread.
                if len(thread.messages) >= MAX_THREAD_LENGTH:
                    break
                else:
                    return match.thread

            # Handle the case where someone is self-sending an email.
            if not message.from_addr or not message.to_addr:
                return

            match_from = [t[1] for t in match.from_addr]
            match_to = [t[1] for t in match.to_addr]
            message_from = [t[1] for t in message.from_addr]
            message_to = [t[1] for t in message.to_addr]

            if (len(message_to) == 1 and message_from == message_to and
                    match_from == match_to and message_to == match_from):
                # Check that we're not over max thread length in this case.
                # No need to loop through the rest of the messages
                # in the thread.
                if len(thread.messages) >= MAX_THREAD_LENGTH:
                    break
                else:
                    return match.thread

    return


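A hypothetical call site, to show how the None return is meant to be consumed when filing an incoming message. The Thread constructor arguments and the surrounding delivery code are assumptions for illustration, not the codebase's actual API.

# Hypothetical sketch of a caller; the real delivery path is not shown here.
thread = fetch_corresponding_thread(db_session, namespace_id, message)
if thread is None:
    # No sufficiently similar thread found: start a new one
    # (constructor arguments assumed).
    thread = Thread(namespace_id=namespace_id, subject=message.subject)
    db_session.add(thread)
message.thread = thread

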
def index_messages(namespace, updated_since=None):
    """ Index the messages of a namespace. """
    namespace_id, namespace_public_id = namespace

    if updated_since is not None:
        updated_since = dateutil.parser.parse(updated_since)

    indexed_count = 0
    search_engine = NamespaceSearchEngine(namespace_public_id)

    with session_scope() as db_session:
        query = db_session.query(Message).filter(
            Message.namespace_id == namespace_id)

        if updated_since is not None:
            query = query.filter(Message.updated_at > updated_since)

        query = query.options(
            joinedload(Message.parts).load_only('content_disposition'))

        encoded = []
        for obj in safer_yield_per(query, Message.id, 0, CHUNK_SIZE):
            encoded_obj = encode(obj, namespace_public_id=namespace_public_id)
            encoded.append(('index', encoded_obj))

        indexed_count += search_engine.messages.bulk_index(encoded)

    log.info('Indexed messages', namespace_id=namespace_id,
             namespace_public_id=namespace_public_id,
             message_count=indexed_count)

    return indexed_count


def index_messages(namespace, updated_since=None):
    """ Index the messages of a namespace. """
    namespace_id, namespace_public_id = namespace

    if updated_since is not None:
        updated_since = dateutil.parser.parse(updated_since)

    indexed_count = 0
    search_engine = NamespaceSearchEngine(namespace_public_id)

    with session_scope() as db_session:
        # `namespace` is the unpacked (id, public_id) tuple, so filter on the
        # local namespace_id rather than a nonexistent `namespace.id`.
        query = db_session.query(Message).filter(
            Message.namespace_id == namespace_id)

        if updated_since is not None:
            query = query.filter(Message.updated_at > updated_since)

        query = query.options(joinedload(Message.parts).
                              load_only('content_disposition'))

        encoded = []
        for obj in safer_yield_per(query, Message.id, 0, CHUNK_SIZE):
            encoded_obj = encode(obj, namespace_public_id=namespace_public_id)
            encoded.append(encoded_obj)

        indexed_count += search_engine.messages.bulk_index(encoded)

    log.info('Indexed messages', namespace_id=namespace_id,
             namespace_public_id=namespace_public_id,
             message_count=indexed_count)

    return indexed_count


def _process_log(self): """Scan the transaction log `self.chunk_size` entries at a time, publishing matching events to registered hooks.""" with session_scope() as db_session: self.log.info('Scanning tx log from id: {}'. format(self.minimum_id)) unprocessed_txn_count = db_session.query( func.count(Transaction.id)).filter( Transaction.table_name == 'message', Transaction.id > self.minimum_id).scalar() if unprocessed_txn_count: self.log.debug('Total of {0} transactions to process'. format(unprocessed_txn_count)) max_tx_id, = db_session.query(func.max(Transaction.id)).one() if max_tx_id is None: max_tx_id = 0 query = db_session.query(Transaction). \ filter(Transaction.table_name == 'message', Transaction.command == 'insert'). \ order_by(asc(Transaction.id)) for transaction in safer_yield_per(query, Transaction.id, self.minimum_id, self.chunk_size): namespace_id = transaction.namespace_id for worker in self.workers[namespace_id]: if worker.match(transaction): worker.enqueue(EventData(transaction)) self.minimum_id = transaction.id + 1 self.log.debug('Processed tx. setting min id to {0}'. format(self.minimum_id))
def index_namespace(namespace_id): """ Backfill function to index a namespace from current db data Not used for incremental indexing. """ if not search_service_url or not doc_service_url: raise Exception("CloudSearch not configured; cannot index") else: search_client = ContactSearchClient(namespace_id) doc_service = get_doc_service() # Look up previously indexed data so we can delete any records which # have disappeared. previous_records = search_client.fetch_all_matching_ids() log.info("previous records", total=len(previous_records), ids=previous_records) indexed = 0 current_records = set() docs = [] with session_scope(namespace_id) as db_session: query = (db_session.query(Contact).options( joinedload("phone_numbers")).filter_by( namespace_id=namespace_id)) for contact in safer_yield_per(query, Contact.id, 0, 1000): log.info("indexing", contact_id=contact.id) current_records.add(long(contact.id)) contact_object = cloudsearch_contact_repr(contact) docs.append({ "type": "add", "id": contact.id, "fields": contact_object }) if len(docs) > DOC_UPLOAD_CHUNK_SIZE: doc_service.upload_documents( documents=json.dumps(docs), contentType="application/json") indexed += len(docs) docs = [] indexed += len(docs) # Deletes are small, so we can stick 'em on this batch. deleted_records = set(previous_records).difference(current_records) for id_ in deleted_records: log.info("deleting", contact_id=id_) docs.append({"type": "delete", "id": id_}) if docs: doc_service.upload_documents(documents=json.dumps(docs), contentType="application/json") log.info( "namespace index complete", namespace_id=namespace_id, total_contacts_indexed=indexed, total_contacts_deleted=len(deleted_records), )
def _process_log(self):
    # TODO(emfree) handle the case that message/thread objects may have
    # been deleted in the interim.
    with session_scope() as db_session:
        query = db_session.query(ActionLog).filter(
            ActionLog.status == 'pending',
            ActionLog.retries < ACTION_MAX_NR_OF_RETRIES)

        if self._scheduled_actions:
            query = query.filter(
                ~ActionLog.id.in_(self._scheduled_actions))
        query = query.order_by(asc(ActionLog.id))

        for log_entry in safer_yield_per(query, ActionLog.id, 0,
                                         self.chunk_size):
            action_function = ACTION_FUNCTION_MAP[log_entry.action]
            namespace = db_session.query(Namespace). \
                get(log_entry.namespace_id)

            # Only actions on accounts associated with this sync-engine
            if namespace.account.sync_host != platform.node():
                continue

            self._scheduled_actions.add(log_entry.id)
            self.log.info('delegating action', action_id=log_entry.id,
                          msg=log_entry.action)
            semaphore = self.semaphore_map[(namespace.account_id,
                                            log_entry.action)]
            gevent.spawn(syncback_worker, semaphore, action_function,
                         log_entry.id, log_entry.record_id,
                         namespace.account_id, syncback_service=self,
                         extra_args=log_entry.extra_args)


def _process_log(self):
    # TODO(emfree) handle the case that message/thread objects may have
    # been deleted in the interim.
    with session_scope() as db_session:
        query = db_session.query(ActionLog).filter(~ActionLog.executed)

        if self._scheduled_actions:
            query = query.filter(
                ~ActionLog.id.in_(self._scheduled_actions))
        query = query.order_by(asc(ActionLog.id))

        for log_entry in safer_yield_per(query, ActionLog.id, 0,
                                         self.chunk_size):
            action_function = ACTION_FUNCTION_MAP[log_entry.action]
            namespace = db_session.query(Namespace). \
                get(log_entry.namespace_id)

            # Only actions on accounts associated with this sync-engine
            if namespace.account.sync_host != platform.node():
                continue

            self._scheduled_actions.add(log_entry.id)
            self.log.info('delegating action', action_id=log_entry.id,
                          msg=log_entry.action)
            semaphore = self.semaphore_map[(namespace.account_id,
                                            log_entry.action)]
            gevent.spawn(syncback_worker, semaphore, action_function,
                         log_entry.id, log_entry.record_id,
                         namespace.account_id, syncback_service=self,
                         extra_args=log_entry.extra_args)


def _process_log(self): """Scan the transaction log `self.chunk_size` entries at a time, publishing matching events to registered hooks.""" with session_scope() as db_session: self.log.info('Scanning tx log from id: {}'.format( self.minimum_id)) unprocessed_txn_count = db_session.query( func.count(Transaction.id)).filter( Transaction.table_name == 'message', Transaction.id > self.minimum_id).scalar() if unprocessed_txn_count: self.log.debug('Total of {0} transactions to process'.format( unprocessed_txn_count)) max_tx_id, = db_session.query(func.max(Transaction.id)).one() if max_tx_id is None: max_tx_id = 0 query = db_session.query(Transaction). \ filter(Transaction.table_name == 'message', Transaction.command == 'insert'). \ order_by(asc(Transaction.id)) for transaction in safer_yield_per(query, Transaction.id, self.minimum_id, self.chunk_size): namespace_id = transaction.namespace_id for worker in self.workers[namespace_id]: if worker.match(transaction): worker.enqueue(EventData(transaction)) self.minimum_id = transaction.id + 1 self.log.debug('Processed tx. setting min id to {0}'.format( self.minimum_id))
def index_namespace(namespace_id): """ Backfill function to index a namespace from current db data Not used for incremental indexing. """ if not search_service_url or not doc_service_url: raise Exception('CloudSearch not configured; cannot index') else: search_client = ContactSearchClient(namespace_id) doc_service = get_doc_service() # Look up previously indexed data so we can delete any records which # have disappeared. previous_records = search_client.fetch_all_matching_ids() log.info("previous records", total=len(previous_records), ids=previous_records) indexed = 0 current_records = set() docs = [] with session_scope(namespace_id) as db_session: query = db_session.query(Contact).options( joinedload("phone_numbers")).filter_by( namespace_id=namespace_id) for contact in safer_yield_per(query, Contact.id, 0, 1000): log.info("indexing", contact_id=contact.id) current_records.add(long(contact.id)) contact_object = cloudsearch_contact_repr(contact) docs.append({'type': 'add', 'id': contact.id, 'fields': contact_object}) if len(docs) > DOC_UPLOAD_CHUNK_SIZE: doc_service.upload_documents( documents=json.dumps(docs), contentType='application/json') indexed += len(docs) docs = [] indexed += len(docs) # Deletes are small, so we can stick 'em on this batch. deleted_records = set(previous_records).difference(current_records) for id_ in deleted_records: log.info("deleting", contact_id=id_) docs.append({'type': 'delete', 'id': id_}) if docs: doc_service.upload_documents( documents=json.dumps(docs), contentType='application/json') log.info("namespace index complete", namespace_id=namespace_id, total_contacts_indexed=indexed, total_contacts_deleted=len(deleted_records))
def index_namespace(namespace_id):
    if not CLOUDSEARCH_DOMAIN:
        raise Exception('CloudSearch not configured; cannot index')
    else:
        search_client = ContactSearchClient(namespace_id)
        doc_service = get_doc_service()

        # Look up previously indexed data so we can delete any records which
        # have disappeared.
        previous_records = search_client.fetch_all_matching_ids()

        log.info("previous records", total=len(previous_records),
                 ids=previous_records)

        indexed = 0
        current_records = set()
        docs = []
        with session_scope() as db_session:
            query = db_session.query(Contact).options(
                joinedload("phone_numbers")).filter_by(
                namespace_id=namespace_id)
            for contact in safer_yield_per(query, Contact.id, 0, 1000):
                log.info("indexing", contact_id=contact.id)
                current_records.add(long(contact.id))
                contact_object = cloudsearch_contact_repr(contact)
                docs.append({'type': 'add', 'id': contact.id,
                             'fields': contact_object})
                if len(docs) > DOC_UPLOAD_CHUNK_SIZE:
                    doc_service.upload_documents(
                        documents=json.dumps(docs),
                        contentType='application/json')
                    indexed += len(docs)
                    docs = []

        indexed += len(docs)

        # Deletes are small, so we can stick 'em on this batch.
        deleted_records = set(previous_records).difference(current_records)
        for id_ in deleted_records:
            log.info("deleting", contact_id=id_)
            docs.append({'type': 'delete', 'id': id_})

        if docs:
            doc_service.upload_documents(documents=json.dumps(docs),
                                         contentType='application/json')

        log.info("namespace index complete",
                 total_contacts_indexed=indexed,
                 total_contacts_deleted=len(deleted_records))


def index_threads(namespace_id, namespace_public_id, created_before=None):
    """ Index the threads of a namespace. """
    if created_before is not None:
        created_before = dateutil.parser.parse(created_before)

    indexed_count = 0
    search_engine = NamespaceSearchEngine(namespace_public_id,
                                          create_index=True)

    with session_scope() as db_session:
        query = db_session.query(Thread).filter(
            Thread.namespace_id == namespace_id)

        if created_before is not None:
            query = query.filter(Thread.created_at <= created_before)

        query = query.options(
            subqueryload(Thread.messages).load_only(
                'public_id', 'is_draft', 'from_addr', 'to_addr',
                'cc_addr', 'bcc_addr'),
            subqueryload('tagitems').joinedload('tag').load_only(
                'public_id', 'name'))

        encoded = []
        for obj in safer_yield_per(query, Thread.id, 0, CHUNK_SIZE):
            if len(encoded) >= INDEX_CHUNK_SIZE:
                indexed_count += search_engine.threads.bulk_index(encoded)
                encoded = []

            index_obj = encode(obj, namespace_public_id=namespace_public_id)
            encoded.append(('index', index_obj))

        if encoded:
            indexed_count += search_engine.threads.bulk_index(encoded)

    log.info('Indexed threads', namespace_id=namespace_id,
             namespace_public_id=namespace_public_id,
             thread_count=indexed_count)

    return indexed_count


def upgrade():
    from inbox.models.session import session_scope
    from inbox.models import Namespace, Tag, Thread
    from inbox.sqlalchemy_ext.util import safer_yield_per
    from sqlalchemy import func
    from sqlalchemy.orm import joinedload

    with session_scope() as db_session:
        # Create the attachment tag
        for ns in db_session.query(Namespace):
            Tag.create_canonical_tags(ns, db_session)

        thread_count, = db_session.query(func.count(Thread.id)).one()
        q = db_session.query(Thread).options(joinedload(Thread.messages))
        processed_count = 0
        for thr in safer_yield_per(q, Thread.id, 1, thread_count):
            if any(m.attachments for m in thr.messages):
                attachment_tag = thr.namespace.tags['attachment']
                thr.apply_tag(attachment_tag)
            processed_count += 1
            print processed_count


def _process_log(self):
    # TODO(emfree) handle the case that message/thread objects may have
    # been deleted in the interim.
    with session_scope() as db_session:
        query = db_session.query(ActionLog).filter(~ActionLog.executed)

        if self._scheduled_actions:
            query = query.filter(
                ~ActionLog.id.in_(self._scheduled_actions))
        query = query.order_by(asc(ActionLog.id))

        for log_entry in safer_yield_per(query, ActionLog.id, 0,
                                         self.chunk_size):
            action_function = ACTION_FUNCTION_MAP[log_entry.action]
            namespace = db_session.query(Namespace). \
                get(log_entry.namespace_id)
            self._scheduled_actions.add(log_entry.id)
            worker = SyncbackWorker(action_function, log_entry.id,
                                    log_entry.record_id,
                                    namespace.account_id,
                                    syncback_service=self)
            self.log.info('delegating action', action_id=log_entry.id)
            self.worker_pool.start(worker)


def index_messages(namespace_id, namespace_public_id, created_before=None):
    """ Index the messages of a namespace. """
    if created_before is not None:
        created_before = dateutil.parser.parse(created_before)

    indexed_count = 0
    search_engine = NamespaceSearchEngine(namespace_public_id,
                                          create_index=True)

    with session_scope() as db_session:
        query = db_session.query(Message).filter(
            Message.namespace_id == namespace_id)

        if created_before is not None:
            query = query.filter(Message.created_at <= created_before)

        query = query.options(
            joinedload(Message.parts).load_only('content_disposition'))

        encoded = []
        for obj in safer_yield_per(query, Message.id, 0, CHUNK_SIZE):
            if len(encoded) >= INDEX_CHUNK_SIZE:
                indexed_count += search_engine.messages.bulk_index(encoded)
                encoded = []

            index_obj = encode(obj, namespace_public_id=namespace_public_id)
            encoded.append(('index', index_obj))

        if encoded:
            indexed_count += search_engine.messages.bulk_index(encoded)

    log.info('Indexed messages', namespace_id=namespace_id,
             namespace_public_id=namespace_public_id,
             message_count=indexed_count)

    return indexed_count


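To backfill an entire deployment, the (namespace_id, namespace_public_id) variants of index_threads and index_messages above would presumably be invoked once per namespace. A hypothetical driver, assuming Namespace exposes `id` and `public_id` columns (as the encode(..., namespace_public_id=...) calls suggest):

# Hypothetical backfill driver; not part of the functions above.
def index_all_namespaces(created_before=None):
    with session_scope() as db_session:
        namespaces = db_session.query(Namespace.id,
                                      Namespace.public_id).all()
    for ns_id, ns_public_id in namespaces:
        index_threads(ns_id, ns_public_id, created_before=created_before)
        index_messages(ns_id, ns_public_id, created_before=created_before)

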
def get_entries_from_public_id(namespace_id, cursor_start, db_session,
                               result_limit):
    """Returns up to result_limit processed transaction log entries for the
    given namespace_id. Begins processing the log after the transaction with
    public_id equal to the cursor_start parameter.

    Arguments
    ---------
    namespace_id: int
    cursor_start: string
        The public_id of the transaction log entry after which to begin
        processing. Normally this should be the return value of a previous
        call to get_public_id_from_ts, or the value of 'cursor_end' from a
        previous call to this function.
    db_session: InboxSession
    result_limit: int
        The maximum number of deltas to return.

    Returns
    -------
    Dictionary with keys:
     - 'cursor_start'
     - 'deltas': list of serialized add/modify/delete deltas
     - (optional) 'cursor_end': the public_id of the last transaction log
       entry in the returned deltas, if available. This value can be passed
       as cursor_start in a subsequent call to this function to get the next
       page of results.

    Raises
    ------
    ValueError
        If cursor_start is invalid.
    """
    try:
        # Check that cursor_start can be a public id, and interpret the
        # special stamp value '0'.
        int_value = int(cursor_start, 36)
        if not int_value:
            internal_start_id = 0
        else:
            internal_start_id, = db_session.query(Transaction.id). \
                filter(Transaction.public_id == cursor_start,
                       Transaction.namespace_id == namespace_id).one()
    except (ValueError, NoResultFound):
        raise ValueError(
            'Invalid first_public_id parameter: {}'.format(cursor_start))

    query = db_session.query(Transaction). \
        order_by(asc(Transaction.id)). \
        filter(Transaction.namespace_id == namespace_id)

    deltas = []
    cursor_end = cursor_start
    for transaction in safer_yield_per(query, Transaction.id,
                                       internal_start_id + 1, result_limit):
        if should_publish_transaction(transaction, db_session):
            event = create_event(transaction)
            deltas.append(event)
            cursor_end = transaction.public_id
            if len(deltas) == result_limit:
                break

    result = {
        'cursor_start': cursor_start,
        'deltas': deltas,
        'cursor_end': cursor_end
    }
    return result


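As the docstring notes, 'cursor_end' feeds back in as cursor_start on the next call. A hypothetical paging loop over the whole delta stream, given a namespace_id and db_session; the '0' start value follows the "special stamp value" handled in the code, and handle_delta is a placeholder for the caller's own processing:

# Hypothetical consumer; pages until an empty batch comes back.
cursor = '0'  # special stamp value: start from the beginning of the log
while True:
    page = get_entries_from_public_id(namespace_id, cursor, db_session,
                                      result_limit=100)
    if not page['deltas']:
        break
    for delta in page['deltas']:
        handle_delta(delta)  # placeholder for caller-defined processing
    cursor = page['cursor_end']

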
def get_entries_from_public_id(namespace_id, events_start, db_session,
                               result_limit):
    """Returns up to result_limit processed transaction log entries for the
    given namespace_id. Begins processing the log after the transaction with
    public_id equal to the events_start parameter.

    Arguments
    ---------
    namespace_id: int
    events_start: string
        The public_id of the transaction log entry after which to begin
        processing. Normally this should be the return value of a previous
        call to get_public_id_from_ts, or the value of 'events_end' from a
        previous call to this function.
    db_session: InboxSession
    result_limit: int
        The maximum number of events to return.

    Returns
    -------
    Dictionary with keys:
     - 'events_start'
     - 'events': list of serialized add/modify/delete events
     - (optional) 'events_end': the public_id of the last transaction log
       entry in the returned events, if available. This value can be passed
       as events_start in a subsequent call to this function to get the next
       page of results.

    Raises
    ------
    ValueError
        If events_start is invalid.
    """
    try:
        # Check that events_start can be a public id, and interpret the
        # special stamp value '0'.
        int_value = int(events_start, 36)
        if not int_value:
            internal_start_id = 0
        else:
            internal_start_id, = db_session.query(Transaction.id). \
                filter(Transaction.public_id == events_start,
                       Transaction.namespace_id == namespace_id).one()
    except (ValueError, NoResultFound):
        raise ValueError('Invalid first_public_id parameter: {}'.
                         format(events_start))

    query = db_session.query(Transaction). \
        order_by(asc(Transaction.id)). \
        filter(Transaction.namespace_id == namespace_id)

    events = []
    events_end = events_start
    for transaction in safer_yield_per(query, Transaction.id,
                                       internal_start_id + 1, result_limit):
        if should_publish_transaction(transaction, db_session):
            event = create_event(transaction)
            events.append(event)
            events_end = transaction.public_id
            if len(events) == result_limit:
                break

    result = {
        'events_start': events_start,
        'events': events,
        'events_end': events_end
    }
    return result