Example #1
0
def test_actions_are_claimed(purge_accounts_and_actions, patched_worker):
    with session_scope_by_shard_id(0) as db_session:
        account = add_generic_imap_account(
            db_session, email_address='{}@test.com'.format(0))
        schedule_test_action(db_session, account)

    with session_scope_by_shard_id(1) as db_session:
        account = add_generic_imap_account(
            db_session, email_address='{}@test.com'.format(1))
        schedule_test_action(db_session, account)

    service = SyncbackService(cpu_id=1, total_cpus=2)
    service.workers = set()
    service._process_log()

    gevent.joinall(list(service.workers))

    with session_scope_by_shard_id(0) as db_session:
        q = db_session.query(ActionLog)
        assert q.count() == 1
        assert all(a.status == 'pending' for a in q)

    with session_scope_by_shard_id(1) as db_session:
        q = db_session.query(ActionLog)
        assert q.count() == 1
        assert all(a.status != 'pending' for a in q)
Example #2
0
def test_actions_are_claimed(purge_accounts_and_actions, patched_task):
    with session_scope_by_shard_id(0) as db_session:
        account = add_generic_imap_account(
            db_session, email_address='{}@test.com'.format(0))
        schedule_test_action(db_session, account)

    with session_scope_by_shard_id(1) as db_session:
        account = add_generic_imap_account(
            db_session, email_address='{}@test.com'.format(1))
        schedule_test_action(db_session, account)

    service = SyncbackService(syncback_id=0,
                              process_number=1,
                              total_processes=2,
                              num_workers=2)
    service._restart_workers()
    service._process_log()

    while not service.task_queue.empty():
        gevent.sleep(0.1)

    with session_scope_by_shard_id(0) as db_session:
        q = db_session.query(ActionLog)
        assert q.count() == 1
        assert all(a.status == 'pending' for a in q)

    with session_scope_by_shard_id(1) as db_session:
        q = db_session.query(ActionLog)
        assert q.count() == 1
        assert all(a.status != 'pending' for a in q)
Example #3
0
def test_actions_are_claimed(purge_accounts_and_actions, patched_worker):
    with session_scope_by_shard_id(0) as db_session:
        account = add_generic_imap_account(db_session,
                                    email_address='{}@test.com'.format(0))
        schedule_test_action(db_session, account)

    with session_scope_by_shard_id(1) as db_session:
        account = add_generic_imap_account(db_session,
                                    email_address='{}@test.com'.format(1))
        schedule_test_action(db_session, account)

    service = SyncbackService(cpu_id=1, total_cpus=2)
    service.workers = set()
    service._process_log()

    gevent.joinall(list(service.workers))

    with session_scope_by_shard_id(0) as db_session:
        q = db_session.query(ActionLog)
        assert q.count() == 1
        assert all(a.status == 'pending' for a in q)

    with session_scope_by_shard_id(1) as db_session:
        q = db_session.query(ActionLog)
        assert q.count() == 1
        assert all(a.status != 'pending' for a in q)
Example #4
0
def test_actions_are_claimed(purge_accounts_and_actions, patched_task):
    with session_scope_by_shard_id(0) as db_session:
        account = add_generic_imap_account(
            db_session, email_address='{}@test.com'.format(0))
        schedule_test_action(db_session, account)

    with session_scope_by_shard_id(1) as db_session:
        account = add_generic_imap_account(
            db_session, email_address='{}@test.com'.format(1))
        schedule_test_action(db_session, account)

    service = SyncbackService(
        syncback_id=0, process_number=1, total_processes=2, num_workers=2)
    service._restart_workers()
    service._process_log()

    while not service.task_queue.empty():
        gevent.sleep(0)

    with session_scope_by_shard_id(0) as db_session:
        q = db_session.query(ActionLog)
        assert q.count() == 1
        assert all(a.status == 'pending' for a in q)

    with session_scope_by_shard_id(1) as db_session:
        q = db_session.query(ActionLog)
        assert q.count() == 1
        assert all(a.status != 'pending' for a in q)
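A note on what Examples #1 through #4 are exercising: each test schedules one pending ActionLog on shard 0 and one on shard 1, starts a SyncbackService as process 1 of 2, and then asserts that only shard 1's action leaves the 'pending' state. That only holds if shards are partitioned across syncback processes. Below is a minimal sketch of the kind of assignment the assertions imply; the actual rule lives inside SyncbackService and may differ, so treat this as an illustration only.

# Hypothetical partitioning rule, inferred from the assertions above:
# process 1 of 2 handles shard 1, so shard 0's action stays 'pending'.
def shards_for_process(shard_ids, process_number, total_processes):
    """Assign each shard to exactly one syncback process (assumed scheme)."""
    return [key for key in shard_ids if key % total_processes == process_number]


assert shards_for_process([0, 1], process_number=1, total_processes=2) == [1]
assert shards_for_process([0, 1], process_number=0, total_processes=2) == [0]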
Example #5
0
    def _run(self):
        """
        Index into CloudSearch the contacts of all namespaces.

        """
        try:
            for key in engine_manager.engines:
                with session_scope_by_shard_id(key) as db_session:
                    pointer = db_session.query(ContactSearchIndexCursor).first()
                    if pointer:
                        self.transaction_pointers[key] = pointer.transaction_id
                    else:
                        # Never start from 0; if the service hasn't
                        # run before start from the latest
                        # transaction, with the expectation that a
                        # backfill will be run separately.
                        latest_transaction = (
                            db_session.query(Transaction).order_by(desc(Transaction.created_at)).first()
                        )
                        if latest_transaction:
                            self.transaction_pointers[key] = latest_transaction.id
                        else:
                            self.transaction_pointers[key] = 0

            self.log.info("Starting contact-search-index service", transaction_pointers=self.transaction_pointers)

            while True:
                for key in engine_manager.engines:
                    with session_scope_by_shard_id(key) as db_session:
                        transactions = (
                            db_session.query(Transaction)
                            .filter(
                                Transaction.id > self.transaction_pointers[key], Transaction.object_type == "contact"
                            )
                            .with_hint(Transaction, "USE INDEX (ix_transaction_table_name)")
                            .order_by(asc(Transaction.id))
                            .limit(self.chunk_size)
                            .options(joinedload(Transaction.namespace))
                            .all()
                        )

                        # index up to chunk_size transactions
                        should_sleep = False
                        if transactions:
                            self.index(transactions, db_session)
                            oldest_transaction = min(transactions, key=lambda t: t.created_at)
                            current_timestamp = datetime.utcnow()
                            latency = (current_timestamp - oldest_transaction.created_at).seconds
                            self._report_transactions_latency(latency)
                            new_pointer = transactions[-1].id
                            self.update_pointer(new_pointer, key, db_session)
                            db_session.commit()
                        else:
                            should_sleep = True
                    if should_sleep:
                        log.info("sleeping")
                        sleep(self.poll_interval)
        except Exception:
            log_uncaught_errors(log)
Example #6
0
    def _run(self):
        """
        Index into CloudSearch the contacts of all namespaces.

        """
        for key in engine_manager.engines:
            with session_scope_by_shard_id(key) as db_session:
                pointer = db_session.query(ContactSearchIndexCursor).first()
                if pointer:
                    self.transaction_pointers[key] = pointer.transaction_id
                else:
                    # Never start from 0; if the service hasn't run before
                    # start from the latest transaction, with the expectation
                    # that a backfill will be run separately.
                    latest_transaction = db_session.query(Transaction). \
                        order_by(desc(Transaction.created_at)).first()
                    if latest_transaction:
                        self.transaction_pointers[key] = latest_transaction.id
                    else:
                        self.transaction_pointers[key] = 0

        self.log.info('Starting contact-search-index service',
                      transaction_pointers=self.transaction_pointers)

        while True:
            for key in engine_manager.engines:
                with session_scope_by_shard_id(key) as db_session:
                    transactions = db_session.query(Transaction). \
                        filter(Transaction.id > self.transaction_pointers[key],
                               Transaction.object_type == 'contact'). \
                        with_hint(Transaction,
                                  "USE INDEX (ix_transaction_table_name)"). \
                        order_by(asc(Transaction.id)). \
                        limit(self.chunk_size). \
                        options(joinedload(Transaction.namespace)).all()

                    # index up to chunk_size transactions
                    should_sleep = False
                    if transactions:
                        self.index(transactions, db_session)
                        new_pointer = transactions[-1].id
                        self.update_pointer(new_pointer, key, db_session)
                        db_session.commit()
                    else:
                        should_sleep = True
                if should_sleep:
                    log.info('sleeping')
                    sleep(self.poll_interval)
Example #7
0
def purge_transactions(shard_id, days_ago=60, limit=1000, throttle=False,
                       dry_run=False):
    # Delete all items from the transaction table that are older than
    # `days_ago` days.
    if dry_run:
        offset = 0
        query = ("SELECT id FROM transaction where created_at < "
                 "DATE_SUB(now(), INTERVAL {} day) LIMIT {}".
                 format(days_ago, limit))
    else:
        query = ("DELETE FROM transaction where created_at < DATE_SUB(now(),"
                 " INTERVAL {} day) LIMIT {}".format(days_ago, limit))
    try:
        # delete from rows until there are no more rows affected
        rowcount = 1
        while rowcount > 0:
            while throttle and check_throttle():
                log.info("Throttling deletion")
                gevent.sleep(60)
            with session_scope_by_shard_id(shard_id, versioned=False) as \
                    db_session:
                if dry_run:
                    rowcount = db_session.execute("{} OFFSET {}".
                                                  format(query, offset)).\
                                                    rowcount
                    offset += rowcount
                else:
                    rowcount = db_session.execute(query).rowcount
            log.info("Deleted batch from transaction table", batch_size=limit,
                     rowcount=rowcount)
        log.info("Finished purging transaction table for shard",
                 shard_id=shard_id, date_delta=days_ago)
    except Exception as e:
        log.critical("Exception encountered during deletion", exception=e)
Example #8
0
def purge_transactions(shard_id, days_ago=60, limit=1000, throttle=False,
                       dry_run=False):
    # Delete all items from the transaction table that are older than
    # `days_ago` days.
    if dry_run:
        offset = 0
        query = ("SELECT id FROM transaction where created_at < "
                 "DATE_SUB(now(), INTERVAL {} day) LIMIT {}".
                 format(days_ago, limit))
    else:
        query = ("DELETE FROM transaction where created_at < DATE_SUB(now(),"
                 " INTERVAL {} day) LIMIT {}".format(days_ago, limit))
    try:
        # delete from rows until there are no more rows affected
        rowcount = 1
        while rowcount > 0:
            while throttle and check_throttle():
                log.info("Throttling deletion")
                gevent.sleep(60)
            with session_scope_by_shard_id(shard_id, versioned=False) as \
                    db_session:
                if dry_run:
                    rowcount = db_session.execute("{} OFFSET {}".
                                                  format(query, offset)).\
                                                    rowcount
                    offset += rowcount
                else:
                    rowcount = db_session.execute(query).rowcount
            log.info("Deleted batch from transaction table", batch_size=limit,
                     rowcount=rowcount)
        log.info("Finished purging transaction table for shard",
                 shard_id=shard_id, date_delta=days_ago)
    except Exception as e:
        log.critical("Exception encountered during deletion", exception=e)
Example #9
0
def get_accounts_to_delete(shard_id):
    ids_to_delete = []
    with session_scope_by_shard_id(shard_id) as db_session:
        ids_to_delete = [(acc.id, acc.namespace.id)
                         for acc in db_session.query(Account)
                         if acc.is_marked_for_deletion]
    return ids_to_delete
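Example #9 is the shared pattern at its simplest: open a short-lived session against a single shard, read what you need, and let the context manager commit or roll back on exit. Most of the other examples repeat exactly this per shard by looping over engine_manager.engines. A minimal sketch of that loop follows; the import paths are assumptions about the surrounding codebase (they are not shown in the examples), so adjust them to wherever these helpers actually live.

# Minimal sketch: gather account ids across every shard.
# Import paths below are assumptions, not taken from the examples.
from inbox.ignition import engine_manager
from inbox.models import Account
from inbox.models.session import session_scope_by_shard_id


def all_account_ids():
    ids = []
    for key in engine_manager.engines:
        with session_scope_by_shard_id(key) as db_session:
            # Read-only query; the context manager handles commit/rollback.
            ids.extend(id_ for id_, in db_session.query(Account.id))
    return ids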
Example #10
0
    def _process_log(self):
        for key in self.keys:
            with session_scope_by_shard_id(key) as db_session:
                query = db_session.query(ActionLog).join(Namespace).\
                    join(Account).\
                    filter(ActionLog.discriminator == 'actionlog',
                           ActionLog.status == 'pending',
                           Account.sync_should_run).\
                    order_by(ActionLog.id).\
                    options(contains_eager(ActionLog.namespace,
                                           Namespace.account))

                running_action_ids = [
                    worker.action_log_id for worker in self.workers
                ]
                if running_action_ids:
                    query = query.filter(~ActionLog.id.in_(running_action_ids))

                for log_entry in query:
                    namespace = log_entry.namespace
                    self.log.info('delegating action',
                                  action_id=log_entry.id,
                                  msg=log_entry.action)
                    semaphore = self.account_semaphores[namespace.account_id]
                    worker = SyncbackWorker(
                        action_name=log_entry.action,
                        semaphore=semaphore,
                        action_log_id=log_entry.id,
                        record_id=log_entry.record_id,
                        account_id=namespace.account_id,
                        provider=namespace.account.verbose_provider,
                        retry_interval=self.retry_interval,
                        extra_args=log_entry.extra_args)
                    self.workers.add(worker)
                    worker.start()
Example #11
0
def purge_other_accounts(default_account):
    for key in engine_manager.engines:
        with session_scope_by_shard_id(key) as db_session:
            db_session.query(Account).filter(
                Account.id != default_account.id).delete(
                    synchronize_session='fetch')
            db_session.commit()
Example #12
0
    def _index_transactions(self, namespace_ids=[]):
        """ index with filter """
        # index 'em
        shard_should_sleep = []
        for key in engine_manager.engines:
            with session_scope_by_shard_id(key) as db_session:
                txn_query = db_session.query(Transaction).filter(
                    Transaction.id > self.transaction_pointers[key],
                    Transaction.object_type == 'contact')
                if namespace_ids:
                    txn_query = txn_query.filter(
                        Transaction.namespace_id.in_(namespace_ids))
                transactions = txn_query\
                    .order_by(asc(Transaction.id)) \
                    .limit(self.chunk_size).all()

                # index up to chunk_size transactions
                should_sleep = False
                if transactions:
                    self.index(transactions, db_session)
                    oldest_transaction = min(transactions,
                                             key=lambda t: t.created_at)
                    current_timestamp = datetime.utcnow()
                    latency = (current_timestamp -
                               oldest_transaction.created_at).seconds
                    self._report_transactions_latency(latency)
                    new_pointer = transactions[-1].id
                    self.update_pointer(new_pointer, key, db_session)
                    db_session.commit()
                else:
                    should_sleep = True
            shard_should_sleep.append(should_sleep)
        if all(shard_should_sleep):
            log.info('sleeping')
            sleep(self.poll_interval)
Example #13
0
    def _process_log(self):
        for key in self.keys:
            with session_scope_by_shard_id(key) as db_session:

                # Get the list of namespace ids with pending actions
                namespace_ids = [
                    ns_id[0] for ns_id in db_session.query(
                        ActionLog.namespace_id).filter(
                            ActionLog.discriminator == "actionlog",
                            ActionLog.status == "pending",
                        ).distinct()
                ]

                # Pick NUM_PARALLEL_ACCOUNTS randomly to make sure we're
                # executing actions equally for each namespace_id --- we
                # don't want a single account with 100k actions hogging
                # the action log.
                namespaces_to_process = []
                if len(namespace_ids) <= NUM_PARALLEL_ACCOUNTS:
                    namespaces_to_process = namespace_ids
                else:
                    namespaces_to_process = random.sample(
                        namespace_ids, NUM_PARALLEL_ACCOUNTS)
                for ns_id in namespaces_to_process:
                    # The discriminator filter restricts actions to IMAP. EAS
                    # uses a different system.
                    query = (db_session.query(ActionLog).filter(
                        ActionLog.discriminator == "actionlog",
                        ActionLog.status == "pending",
                        ActionLog.namespace_id == ns_id,
                    ).order_by(ActionLog.id).limit(self.fetch_batch_size))
                    task = self._batch_log_entries(db_session, query.all())
                    if task is not None:
                        self.task_queue.put(task)
Example #14
0
    def runnable_accounts(self):
        accounts = set()
        for key in self.shards:
            with session_scope_by_shard_id(key) as db_session:
                accounts.update(id_ for id_, in db_session.query(
                    Account.id).filter(Account.sync_should_run))
        return accounts
Example #15
0
    def _process_log(self):
        for key in self.keys:
            with session_scope_by_shard_id(key) as db_session:
                query = db_session.query(ActionLog).join(Namespace).\
                    join(Account).\
                    filter(ActionLog.discriminator == 'actionlog',
                           ActionLog.status == 'pending',
                           Account.sync_should_run).\
                    order_by(ActionLog.id).\
                    options(contains_eager(ActionLog.namespace,
                                           Namespace.account))

                running_action_ids = [worker.action_log_id for worker in
                                      self.workers]
                if running_action_ids:
                    query = query.filter(~ActionLog.id.in_(running_action_ids))

                for log_entry in query:
                    namespace = log_entry.namespace
                    self.log.info('delegating action',
                                  action_id=log_entry.id,
                                  msg=log_entry.action)
                    semaphore = self.account_semaphores[namespace.account_id]
                    worker = SyncbackWorker(action_name=log_entry.action,
                                            semaphore=semaphore,
                                            action_log_id=log_entry.id,
                                            record_id=log_entry.record_id,
                                            account_id=namespace.account_id,
                                            provider=namespace.account.verbose_provider,
                                            retry_interval=self.retry_interval,
                                            extra_args=log_entry.extra_args)
                    self.workers.add(worker)
                    worker.start()
Example #16
0
    def accounts_to_start(self):
        accounts = []
        for key in engine_manager.engines:
            with session_scope_by_shard_id(key) as db_session:
                start_on_this_cpu = self.account_cpu_filter(self.cpu_id,
                                                            self.total_cpus)
                if (self.stealing_enabled and
                        self.host in self.sync_hosts_for_shards[key]):
                    q = db_session.query(Account).filter(
                        Account.sync_host.is_(None),
                        Account.sync_should_run,
                        start_on_this_cpu)
                    unscheduled_accounts_exist = db_session.query(
                        q.exists()).scalar()
                    if unscheduled_accounts_exist:
                        # Atomically claim unscheduled syncs by setting
                        # sync_host.
                        q.update({'sync_host': self.host},
                                 synchronize_session=False)
                        db_session.commit()

                accounts.extend([id_ for id_, in
                                 db_session.query(Account.id).filter(
                                     Account.sync_should_run,
                                     Account.sync_host == self.host,
                                     start_on_this_cpu)])

                # Close the underlying connection rather than returning it to
                # the pool. This allows this query to run against all shards
                # without potentially acquiring a poorly-utilized, persistent
                # connection from each sync host to each shard.
                db_session.invalidate()
        return accounts
Example #17
0
def purge_other_accounts(default_account):
    for key in engine_manager.engines:
        with session_scope_by_shard_id(key) as db_session:
            db_session.query(Account).filter(
                Account.id != default_account.id).delete(
                    synchronize_session='fetch')
            db_session.commit()
Example #18
0
    def _process_log(self):
        for key in self.keys:
            with session_scope_by_shard_id(key) as db_session:
                query = db_session.query(ActionLog). \
                    filter(ActionLog.discriminator == 'actionlog',
                           ActionLog.status == 'pending'). \
                    order_by(ActionLog.id)

                running_action_ids = {
                    worker.action_log_id
                    for worker in self.workers
                }
                for log_entry in query:
                    if log_entry.id in running_action_ids:
                        continue
                    namespace = log_entry.namespace
                    self.log.info('delegating action',
                                  action_id=log_entry.id,
                                  msg=log_entry.action)
                    semaphore = self.account_semaphores[namespace.account_id]
                    worker = SyncbackWorker(
                        action_name=log_entry.action,
                        semaphore=semaphore,
                        action_log_id=log_entry.id,
                        record_id=log_entry.record_id,
                        account_id=namespace.account_id,
                        provider=namespace.account.verbose_provider,
                        retry_interval=self.retry_interval,
                        extra_args=log_entry.extra_args)
                    self.workers.add(worker)
                    worker.start()
Example #19
0
    def accounts_to_start(self):
        accounts = []
        for key in engine_manager.engines:
            with session_scope_by_shard_id(key) as db_session:
                start_on_this_cpu = self.account_cpu_filter(self.cpu_id,
                                                            self.total_cpus)
                if config.get('SYNC_STEAL_ACCOUNTS', True):
                    q = db_session.query(Account).filter(
                        Account.sync_host.is_(None),
                        Account.sync_should_run,
                        start_on_this_cpu)
                    unscheduled_accounts_exist = db_session.query(
                        q.exists()).scalar()
                    if unscheduled_accounts_exist:
                        # Atomically claim unscheduled syncs by setting
                        # sync_host.
                        q.update({'sync_host': self.host},
                                 synchronize_session=False)
                        db_session.commit()

                accounts.extend([id_ for id_, in
                                 db_session.query(Account.id).filter(
                                     Account.sync_should_run,
                                     Account.sync_host == self.host,
                                     start_on_this_cpu)])
        return accounts
Example #20
0
    def _process_log(self):
        for key in self.keys:
            with session_scope_by_shard_id(key) as db_session:

                # Get the list of namespace ids with pending actions
                namespace_ids = [ns_id[0] for ns_id in db_session.query(ActionLog.namespace_id).filter(
                    ActionLog.discriminator == 'actionlog',
                    ActionLog.status == 'pending').distinct()]

                # Pick NUM_PARALLEL_ACCOUNTS randomly to make sure we're
                # executing actions equally for each namespace_id --- we
                # don't want a single account with 100k actions hogging
                # the action log.
                namespaces_to_process = []
                if len(namespace_ids) <= NUM_PARALLEL_ACCOUNTS:
                    namespaces_to_process = namespace_ids
                else:
                    namespaces_to_process = random.sample(namespace_ids,
                                                          NUM_PARALLEL_ACCOUNTS)
                for ns_id in namespaces_to_process:
                    # The discriminator filter restricts actions to IMAP. EAS
                    # uses a different system.
                    query = db_session.query(ActionLog).filter(
                        ActionLog.discriminator == 'actionlog',
                        ActionLog.status == 'pending',
                        ActionLog.namespace_id == ns_id).order_by(ActionLog.id).\
                        limit(self.batch_size)
                    task = self._batch_log_entries(db_session, query.all())
                    if task is not None:
                        self.task_queue.put(task)
Example #21
0
    def _index_transactions(self, namespace_ids=[]):
        """ index with filter """
        # index 'em
        for key in engine_manager.engines:
            with session_scope_by_shard_id(key) as db_session:
                txn_query = db_session.query(Transaction).filter(
                    Transaction.id > self.transaction_pointers[key],
                    Transaction.object_type == 'contact')
                if namespace_ids:
                    txn_query = txn_query.filter(
                        Transaction.namespace_id.in_(
                            namespace_ids))
                transactions = txn_query\
                    .order_by(asc(Transaction.id)) \
                    .limit(self.chunk_size).all()

                # index up to chunk_size transactions
                should_sleep = False
                if transactions:
                    self.index(transactions, db_session)
                    oldest_transaction = min(
                        transactions, key=lambda t: t.created_at)
                    current_timestamp = datetime.utcnow()
                    latency = (current_timestamp -
                                oldest_transaction.created_at).seconds
                    self._report_transactions_latency(latency)
                    new_pointer = transactions[-1].id
                    self.update_pointer(new_pointer, key, db_session)
                    db_session.commit()
                else:
                    should_sleep = True
            if should_sleep:
                log.info('sleeping')
                sleep(self.poll_interval)
Example #22
0
    def accounts_to_start(self):
        accounts = []
        for key in engine_manager.engines:
            with session_scope_by_shard_id(key) as db_session:
                start_on_this_cpu = self.account_cpu_filter(
                    self.cpu_id, self.total_cpus)
                if (self.stealing_enabled
                        and self.host in self.sync_hosts_for_shards[key]):
                    q = db_session.query(Account).filter(
                        Account.sync_host.is_(None), Account.sync_should_run,
                        start_on_this_cpu)
                    unscheduled_accounts_exist = db_session.query(
                        q.exists()).scalar()
                    if unscheduled_accounts_exist:
                        # Atomically claim unscheduled syncs by setting
                        # sync_host.
                        q.update({'sync_host': self.host},
                                 synchronize_session=False)
                        db_session.commit()

                accounts.extend([
                    id_ for id_, in db_session.query(Account.id).filter(
                        Account.sync_should_run, Account.sync_host ==
                        self.host, start_on_this_cpu)
                ])

                # Close the underlying connection rather than returning it to
                # the pool. This allows this query to run against all shards
                # without potentially acquiring a poorly-utilized, persistent
                # connection from each sync host to each shard.
                db_session.invalidate()
        return accounts
Example #23
0
    def _process_log(self):
        for key in self.keys:
            with session_scope_by_shard_id(key) as db_session:
                query = db_session.query(ActionLog). \
                    filter(ActionLog.discriminator == 'actionlog',
                           ActionLog.status == 'pending'). \
                    order_by(ActionLog.id)

                running_action_ids = {worker.action_log_id for worker in
                                      self.workers}
                for log_entry in query:
                    if log_entry.id in running_action_ids:
                        continue
                    namespace = log_entry.namespace
                    self.log.info('delegating action',
                                  action_id=log_entry.id,
                                  msg=log_entry.action)
                    semaphore = self.account_semaphores[namespace.account_id]
                    worker = SyncbackWorker(action_name=log_entry.action,
                                            semaphore=semaphore,
                                            action_log_id=log_entry.id,
                                            record_id=log_entry.record_id,
                                            account_id=namespace.account_id,
                                            provider=namespace.account.verbose_provider,
                                            retry_interval=self.retry_interval,
                                            extra_args=log_entry.extra_args)
                    self.workers.add(worker)
                    worker.start()
Example #24
0
    def runnable_accounts(self):
        accounts = set()
        for key in self.shards:
            with session_scope_by_shard_id(key) as db_session:
                accounts.update(
                    id_ for id_, in db_session.query(Account.id).filter(
                        Account.sync_should_run))
        return accounts
Example #25
0
def purge_other_accounts(default_account=None):
    for key in engine_manager.engines:
        with session_scope_by_shard_id(key) as db_session:
            q = db_session.query(Account)
            if default_account is not None:
                q = q.filter(Account.id != default_account.id)
            q.delete(synchronize_session="fetch")
            db_session.commit()
Example #26
0
def purge_other_accounts(default_account=None):
    for key in engine_manager.engines:
        with session_scope_by_shard_id(key) as db_session:
            q = db_session.query(Account)
            if default_account is not None:
                q = q.filter(Account.id != default_account.id)
            q.delete(synchronize_session="fetch")
            db_session.commit()
Example #27
0
def delete_marked_accounts(shard_id, throttle=False, dry_run=False):
    start = time.time()
    deleted_count = 0
    ids_to_delete = []

    with session_scope_by_shard_id(shard_id) as db_session:
        ids_to_delete = [(acc.id, acc.namespace.id) for acc
                         in db_session.query(Account) if acc.is_deleted]

    queue_size = len(ids_to_delete)
    for account_id, namespace_id in ids_to_delete:
        # queue_size = length of queue
        # deleted_count = number of accounts deleted during loop iteration
        # this is necessary because the length of ids_to_delete doesn't
        # change during loop iteration
        statsd_client.gauge('mailsync.{}.account_deletion.queue.length'
                            .format(shard_id),
                            queue_size - deleted_count)
        try:
            with session_scope(namespace_id) as db_session:
                account = db_session.query(Account).get(account_id)
                if not account:
                    log.critical('Account does not exist',
                                 account_id=account_id)
                    continue

                if account.sync_should_run or not account.is_deleted:
                    log.warn('Account NOT marked for deletion. '
                             'Will not delete', account_id=account_id)
                    continue

            log.info('Deleting account', account_id=account_id)
            start_time = time.time()
            # Delete data in database
            try:
                log.info('Deleting database data', account_id=account_id)
                delete_namespace(account_id, namespace_id, throttle=throttle,
                                 dry_run=dry_run)
            except Exception as e:
                log.critical('Database data deletion failed', error=e,
                             account_id=account_id)
                continue

            # Delete liveness data
            log.debug('Deleting liveness data', account_id=account_id)
            clear_heartbeat_status(account_id)
            deleted_count += 1
            statsd_client.incr('mailsync.account_deletion.queue.deleted', 1)
            statsd_client.timing('mailsync.account_deletion.queue.deleted',
                                 time.time() - start_time)
        except Exception:
            log_uncaught_errors(log, account_id=account_id)

    end = time.time()
    log.info('All data deleted successfully', shard_id=shard_id,
             time=end - start, count=deleted_count)
Example #28
0
def delete_marked_accounts(shard_id, throttle=False, dry_run=False):
    start = time.time()
    deleted_count = 0
    ids_to_delete = []

    with session_scope_by_shard_id(shard_id) as db_session:
        ids_to_delete = [(acc.id, acc.namespace.id)
                         for acc in db_session.query(Account)
                         if acc.is_deleted]

    for account_id, namespace_id in ids_to_delete:
        try:
            with session_scope(namespace_id) as db_session:
                account = db_session.query(Account).get(account_id)
                if not account:
                    log.critical('Account does not exist',
                                 account_id=account_id)
                    continue

                if account.sync_should_run or not account.is_deleted:
                    log.warn(
                        'Account NOT marked for deletion. '
                        'Will not delete',
                        account_id=account_id)
                    continue

            log.info('Deleting account', account_id=account_id)
            start_time = time.time()
            # Delete data in database
            try:
                log.info('Deleting database data', account_id=account_id)
                delete_namespace(account_id,
                                 namespace_id,
                                 throttle=throttle,
                                 dry_run=dry_run)
            except Exception as e:
                log.critical('Database data deletion failed',
                             error=e,
                             account_id=account_id)
                continue

            # Delete liveness data
            log.debug('Deleting liveness data', account_id=account_id)
            clear_heartbeat_status(account_id)
            deleted_count += 1
            statsd_client.timing('mailsync.account_deletion.queue.deleted',
                                 time.time() - start_time)
        except Exception:
            log_uncaught_errors(log, account_id=account_id)

    end = time.time()
    log.info('All data deleted successfully',
             shard_id=shard_id,
             time=end - start,
             count=deleted_count)
Example #29
0
def test_actions_for_invalid_accounts_are_skipped(purge_accounts_and_actions,
                                                  patched_task):
    with session_scope_by_shard_id(0) as db_session:
        account = add_generic_imap_account(db_session,
                                           email_address='*****@*****.**')
        schedule_test_action(db_session, account)
        namespace_id = account.namespace.id
        count = db_session.query(ActionLog).filter(
            ActionLog.namespace_id == namespace_id).count()
        assert account.sync_state != 'invalid'

        another_account = add_generic_imap_account(
            db_session, email_address='*****@*****.**')
        schedule_test_action(db_session, another_account)
        another_namespace_id = another_account.namespace.id
        another_count = db_session.query(ActionLog).filter(
            ActionLog.namespace_id == another_namespace_id).count()
        assert another_account.sync_state != 'invalid'

        account.mark_invalid()
        db_session.commit()

    service = SyncbackService(syncback_id=0,
                              process_number=0,
                              total_processes=2,
                              num_workers=2)
    service._process_log()

    while not service.task_queue.empty():
        gevent.sleep(0)

    with session_scope_by_shard_id(0) as db_session:
        q = db_session.query(ActionLog).filter(
            ActionLog.namespace_id == namespace_id,
            ActionLog.status == 'pending')
        assert q.count() == count

        q = db_session.query(ActionLog).filter(
            ActionLog.namespace_id == another_namespace_id)
        assert q.filter(ActionLog.status == 'pending').count() == 0
        assert q.filter(
            ActionLog.status == 'successful').count() == another_count
Example #30
0
def test_stealing_limited_by_host(db, config):
    host = platform.node()
    config['DATABASE_HOSTS'][0]['SHARDS'][0]['SYNC_HOSTS'] = [host]
    config['DATABASE_HOSTS'][0]['SHARDS'][1]['SYNC_HOSTS'] = ['otherhost']
    purge_other_accounts()
    ss = SyncService(cpu_id=0, total_cpus=1)
    for key in (0, 1):
        with session_scope_by_shard_id(key) as db_session:
            acc = Account()
            acc.namespace = Namespace()
            db_session.add(acc)
            db_session.commit()

    ss.accounts_to_start()
    with session_scope_by_shard_id(0) as db_session:
        acc = db_session.query(Account).first()
        assert acc.sync_host == host
    with session_scope_by_shard_id(1) as db_session:
        acc = db_session.query(Account).first()
        assert acc.sync_host is None
Example #31
0
def test_stealing_limited_by_host(db, config):
    host = platform.node()
    config['DATABASE_HOSTS'][0]['SHARDS'][0]['SYNC_HOSTS'] = [host]
    config['DATABASE_HOSTS'][0]['SHARDS'][1]['SYNC_HOSTS'] = ['otherhost']
    purge_other_accounts()
    ss = SyncService(cpu_id=0, total_cpus=1)
    for key in (0, 1):
        with session_scope_by_shard_id(key) as db_session:
            acc = Account()
            acc.namespace = Namespace()
            db_session.add(acc)
            db_session.commit()

    ss.accounts_to_start()
    with session_scope_by_shard_id(0) as db_session:
        acc = db_session.query(Account).first()
        assert acc.sync_host == host
    with session_scope_by_shard_id(1) as db_session:
        acc = db_session.query(Account).first()
        assert acc.sync_host is None
Example #32
0
def main(min_id, max_id, shard_id):
    maybe_enable_rollbar()

    generic_accounts = []
    failed = []

    if min_id is not None or max_id is not None:
        # Get the list of running Gmail accounts.
        with global_session_scope() as db_session:
            generic_accounts = db_session.query(GenericAccount).filter(
                GenericAccount.sync_state == "running")

            if min_id is not None:
                generic_accounts = generic_accounts.filter(
                    GenericAccount.id > min_id)

            if max_id is not None:
                generic_accounts = generic_accounts.filter(
                    GenericAccount.id <= max_id)

            generic_accounts = [acc.id for acc in generic_accounts]

            db_session.expunge_all()

    elif shard_id is not None:
        with session_scope_by_shard_id(shard_id) as db_session:
            generic_accounts = db_session.query(GenericAccount).filter(
                GenericAccount.sync_state == "running")

            generic_accounts = [acc.id for acc in generic_accounts]
            db_session.expunge_all()

    print("Total accounts", len(generic_accounts))

    for account_id in generic_accounts:
        try:
            with session_scope(account_id) as db_session:
                account = db_session.query(GenericAccount).get(account_id)
                print("Updating", account.email_address)

                with connection_pool(account.id).get() as crispin_client:
                    account.folder_prefix = crispin_client.folder_prefix
                    account.folder_separator = crispin_client.folder_separator

                db_session.commit()
        except Exception:
            failed.append(account_id)

    print("Processed accounts:")
    print(generic_accounts)

    print("Failed accounts:")
    print(failed)
Example #33
0
def test_actions_for_invalid_accounts_are_skipped(purge_accounts_and_actions,
                                                  patched_worker):
    with session_scope_by_shard_id(0) as db_session:
        account = add_generic_imap_account(
            db_session, email_address='*****@*****.**')
        schedule_test_action(db_session, account)
        namespace_id = account.namespace.id
        count = db_session.query(ActionLog).filter(
            ActionLog.namespace_id == namespace_id).count()
        assert account.sync_state != 'invalid'

        another_account = add_generic_imap_account(
            db_session, email_address='*****@*****.**')
        schedule_test_action(db_session, another_account)
        another_namespace_id = another_account.namespace.id
        another_count = db_session.query(ActionLog).filter(
            ActionLog.namespace_id == another_namespace_id).count()
        assert another_account.sync_state != 'invalid'

        account.mark_invalid()
        db_session.commit()

    service = SyncbackService(
        syncback_id=0, process_number=0, total_processes=2)
    service._process_log()

    while len(service.workers) >= 1:
        gevent.sleep(0.1)
    gevent.killall(service.workers)

    with session_scope_by_shard_id(0) as db_session:
        q = db_session.query(ActionLog).filter(
            ActionLog.namespace_id == namespace_id,
            ActionLog.status == 'pending')
        assert q.count() == count

        q = db_session.query(ActionLog).filter(
            ActionLog.namespace_id == another_namespace_id)
        assert q.filter(ActionLog.status == 'pending').count() == 0
        assert q.filter(ActionLog.status == 'successful').count() == another_count
Example #34
0
def main(min_id, max_id, shard_id):
    generic_accounts = []
    failed = []

    if min_id is not None or max_id is not None:
        # Get the list of running Gmail accounts.
        with global_session_scope() as db_session:
            generic_accounts = db_session.query(GenericAccount).filter(
                GenericAccount.sync_state == 'running')

            if min_id is not None:
                generic_accounts = generic_accounts.filter(
                    GenericAccount.id > min_id)

            if max_id is not None:
                generic_accounts = generic_accounts.filter(
                    GenericAccount.id <= max_id)

            generic_accounts = [acc.id for acc in generic_accounts]

            db_session.expunge_all()

    elif shard_id is not None:
        with session_scope_by_shard_id(shard_id) as db_session:
            generic_accounts = db_session.query(GenericAccount).filter(
                GenericAccount.sync_state == 'running')

            generic_accounts = [acc.id for acc in generic_accounts]
            db_session.expunge_all()

    print "Total accounts: %d" % len(generic_accounts)

    for account_id in generic_accounts:
        try:
            with session_scope(account_id) as db_session:
                account = db_session.query(GenericAccount).get(account_id)
                print "Updating %s" % account.email_address

                with connection_pool(account.id).get() as crispin_client:
                    account.folder_prefix = crispin_client.folder_prefix
                    account.folder_separator = crispin_client.folder_separator

                db_session.commit()
        except Exception:
            failed.append(account_id)

    print "Processed accounts:"
    print generic_accounts

    print "Failed accounts:"
    print failed
Example #35
0
    def _process_log(self):
        for key in self.keys:
            with session_scope_by_shard_id(key) as db_session:
                query = db_session.query(ActionLog).filter(
                    ActionLog.discriminator == 'actionlog',
                    ActionLog.status == 'pending').order_by(ActionLog.id).\
                    limit(500)

                running_action_ids = {
                    worker.action_log_id
                    for worker in self.workers
                }
                for log_entry in query:
                    if log_entry.id in running_action_ids:
                        continue
                    namespace = log_entry.namespace
                    if namespace.account.sync_state == 'invalid':
                        self.log.warning('Skipping action for invalid account',
                                         account_id=namespace.account.id,
                                         action_id=log_entry.id,
                                         action=log_entry.action)
                        continue
                    ### BEGIN SHIM ###
                    # Shim until sync-syncback integration fully deployed.
                    sync_host = namespace.account.sync_host
                    if (sync_host and sync_host.split(':')[0]
                            in SKIP_SYNCBACK_FOR_HOSTS):
                        self.log.info('SyncbackService not delegating for',
                                      account_id=namespace.account.id,
                                      sync_host=sync_host,
                                      action_id=log_entry.id)
                        continue
                    ### END SHIM ###
                    self.log.info('delegating action',
                                  action_id=log_entry.id,
                                  msg=log_entry.action)
                    semaphore = self.account_semaphores[namespace.account_id]
                    worker = SyncbackWorker(
                        action_name=log_entry.action,
                        semaphore=semaphore,
                        action_log_id=log_entry.id,
                        record_id=log_entry.record_id,
                        account_id=namespace.account_id,
                        provider=namespace.account.verbose_provider,
                        retry_interval=self.retry_interval,
                        extra_args=log_entry.extra_args)
                    self.workers.add(worker)
                    worker.start()
Example #36
0
def delete_marked_accounts(shard_id, throttle=False, dry_run=False):
    start = time.time()
    deleted_count = 0
    ids_to_delete = []

    with session_scope_by_shard_id(shard_id) as db_session:
        ids_to_delete = [(acc.id, acc.namespace.id) for acc
                         in db_session.query(Account) if acc.is_deleted]

    for account_id, namespace_id in ids_to_delete:
        try:
            with session_scope(namespace_id) as db_session:
                account = db_session.query(Account).get(account_id)
                if not account:
                    log.critical('Account does not exist',
                                 account_id=account_id)
                    continue

                if account.sync_should_run or not account.is_deleted:
                    log.warn('Account NOT marked for deletion. '
                             'Will not delete', account_id=account_id)
                    continue

            log.info('Deleting account', account_id=account_id)
            start_time = time.time()
            # Delete data in database
            try:
                log.info('Deleting database data', account_id=account_id)
                delete_namespace(account_id, namespace_id, throttle=throttle,
                                 dry_run=dry_run)
            except Exception as e:
                log.critical('Database data deletion failed', error=e,
                             account_id=account_id)
                continue

            # Delete liveness data
            log.debug('Deleting liveness data', account_id=account_id)
            clear_heartbeat_status(account_id)
            deleted_count += 1
            statsd_client.timing('mailsync.account_deletion.queue.deleted',
                                 time.time() - start_time)
            gevent.sleep(60)
        except Exception:
            log_uncaught_errors(log, account_id=account_id)

    end = time.time()
    log.info('All data deleted successfully', shard_id=shard_id,
             time=end - start, count=deleted_count)
Example #37
0
    def _set_transaction_pointers(self):
        for key in engine_manager.engines:
            with session_scope_by_shard_id(key) as db_session:
                pointer = db_session.query(ContactSearchIndexCursor).first()
                if pointer:
                    self.transaction_pointers[key] = pointer.transaction_id
                else:
                    # Never start from 0; if the service hasn't run before
                    # start from the latest transaction, with the expectation
                    # that a backfill will be run separately.
                    max_id = db_session.query(func.max(Transaction.id)).scalar() or 0
                    latest_transaction = db_session.query(Transaction).get(max_id)
                    if latest_transaction:
                        self.transaction_pointers[key] = latest_transaction.id
                    else:
                        self.transaction_pointers[key] = 0
Example #38
0
    def _process_log(self):
        before = datetime.utcnow()
        for key in self.keys:
            with session_scope_by_shard_id(key) as db_session:

                # Get the list of namespace ids with pending actions
                namespace_ids = [
                    ns_id[0]
                    for ns_id in db_session.query(ActionLog.namespace_id).
                    filter(ActionLog.discriminator == 'actionlog',
                           ActionLog.status == 'pending').distinct()
                ]

                # Pick NUM_PARALLEL_ACCOUNTS randomly to make sure we're
                # executing actions equally for each namespace_id --- we
                # don't want a single account with 100k actions hogging
                # the action log.
                namespaces_to_process = []
                if len(namespace_ids) <= NUM_PARALLEL_ACCOUNTS:
                    namespaces_to_process = namespace_ids
                else:
                    namespaces_to_process = random.sample(
                        namespace_ids, NUM_PARALLEL_ACCOUNTS)
                self.log.debug('Syncback namespace_ids count',
                               shard_id=key,
                               process=self.process_number,
                               num_namespace_ids=len(namespace_ids))

                for ns_id in namespaces_to_process:
                    # The discriminator filter restricts actions to IMAP. EAS
                    # uses a different system.
                    query = db_session.query(ActionLog).filter(
                        ActionLog.discriminator == 'actionlog',
                        ActionLog.status == 'pending',
                        ActionLog.namespace_id == ns_id).order_by(ActionLog.id).\
                        limit(self.batch_size)
                    task = self._batch_log_entries(db_session, query.all())
                    if task is not None:
                        self.task_queue.put(task)

        after = datetime.utcnow()
        self.log.debug('Syncback completed one iteration',
                       process=self.process_number,
                       duration=(after - before).total_seconds(),
                       idle_workers=self.num_idle_workers)
Example #39
0
    def _set_transaction_pointers(self):
        for key in engine_manager.engines:
            with session_scope_by_shard_id(key) as db_session:
                pointer = db_session.query(ContactSearchIndexCursor).first()
                if pointer:
                    self.transaction_pointers[key] = pointer.transaction_id
                else:
                    # Never start from 0; if the service hasn't run before
                    # start from the latest transaction, with the expectation
                    # that a backfill will be run separately.
                    max_id = db_session.query(func.max(
                        Transaction.id)).scalar() or 0
                    latest_transaction = \
                        db_session.query(Transaction).get(max_id)
                    if latest_transaction:
                        self.transaction_pointers[key] = latest_transaction.id
                    else:
                        self.transaction_pointers[key] = 0
Example #40
0
def test_actions_claimed_by_a_single_service(purge_accounts_and_actions,
                                             patched_worker):
    actionlogs = []
    for key in (0, 1):
        with session_scope_by_shard_id(key) as db_session:
            account = make_imap_account(db_session, '{}@test.com'.format(key))
            schedule_test_action(db_session, account)
            actionlogs += [db_session.query(ActionLog).one().id]

    services = []
    for cpu_id in (0, 1):
        service = SyncbackService(cpu_id=cpu_id, total_cpus=2)
        service.workers = set()
        service._process_log()
        services.append(service)

    for i, service in enumerate(services):
        assert len(service.workers) == 1
        assert list(service.workers)[0].action_log_id == actionlogs[i]
        gevent.joinall(list(service.workers))
Example #41
0
def page_over_shards(Model, cursor, limit, get_results=lambda q: q.all()):
    # TODO revisit passing lambda, and cursor format
    cursor = int(cursor)
    start_shard_id = engine_manager.shard_key_for_id(cursor)
    results = []
    remaining_limit = limit
    next_cursor = None
    for shard_id in sorted(engine_manager.engines):
        if shard_id < start_shard_id:
            continue

        if len(results) >= limit:
            break

        with session_scope_by_shard_id(shard_id) as mailsync_session:
            latest_cursor = cursor if shard_id == start_shard_id else None
            query = mailsync_session.query(Model)
            if latest_cursor:
                query = query.filter(Model.id > latest_cursor)
            query = query.order_by(asc(Model.id)).limit(remaining_limit)
            latest_results = get_results(query)

            if latest_results:
                results.extend(latest_results)
                last = latest_results[-1]
                if hasattr(last, "id"):
                    next_cursor = last.id
                elif "id" in last:
                    next_cursor = last["id"]
                else:
                    raise ValueError(
                        "Results returned from get_results must have an id"
                    )

                # Handle invalid ids
                cursor_implied_shard = next_cursor >> 48
                if shard_id != 0 and cursor_implied_shard == 0:
                    next_cursor += shard_id << 48

                remaining_limit -= len(latest_results)
    return results, str(next_cursor)
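The ">> 48" and "<< 48" arithmetic above implies that a row id carries its shard in its high bits. A small, self-contained illustration of that convention; the 48-bit offset is taken from the shifts in the code, everything else is hypothetical.

SHARD_ID_OFFSET = 48  # matches the ">> 48" / "<< 48" shifts above

def implied_shard(global_id):
    # High bits name the shard; low bits are the per-shard counter.
    return global_id >> SHARD_ID_OFFSET

def to_global_id(shard_id, local_id):
    return (shard_id << SHARD_ID_OFFSET) | local_id

assert implied_shard(to_global_id(3, 12345)) == 3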
Example #42
0
    def _process_log(self):
        before = datetime.utcnow()
        for key in self.keys:
            with session_scope_by_shard_id(key) as db_session:

                # Get the list of namespace ids with pending actions
                namespace_ids = [ns_id[0] for ns_id in db_session.query(ActionLog.namespace_id).filter(
                    ActionLog.discriminator == 'actionlog',
                    ActionLog.status == 'pending').distinct()]

                # Pick NUM_PARALLEL_ACCOUNTS randomly to make sure we're
                # executing actions equally for each namespace_id --- we
                # don't want a single account with 100k actions hogging
                # the action log.
                namespaces_to_process = []
                if len(namespace_ids) <= NUM_PARALLEL_ACCOUNTS:
                    namespaces_to_process = namespace_ids
                else:
                    namespaces_to_process = random.sample(namespace_ids,
                                                          NUM_PARALLEL_ACCOUNTS)
                self.log.debug('Syncback namespace_ids count', shard_id=key,
                               process=self.process_number,
                               num_namespace_ids=len(namespace_ids))

                for ns_id in namespaces_to_process:
                    # The discriminator filter restricts actions to IMAP. EAS
                    # uses a different system.
                    query = db_session.query(ActionLog).filter(
                        ActionLog.discriminator == 'actionlog',
                        ActionLog.status == 'pending',
                        ActionLog.namespace_id == ns_id).order_by(ActionLog.id).\
                        limit(self.batch_size)
                    task = self._batch_log_entries(db_session, query.all())
                    if task is not None:
                        self.task_queue.put(task)

        after = datetime.utcnow()
        self.log.debug('Syncback completed one iteration',
                       process=self.process_number,
                       duration=(after - before).total_seconds(),
                       idle_workers=self.num_idle_workers)
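This variant only enqueues batched work; the workers that drain task_queue are not shown here. A minimal sketch of the consuming side, assuming a gevent queue and a hypothetical execute() method on the task object (neither detail is confirmed by the snippet).

import gevent

def worker_loop(task_queue):
    while True:
        task = task_queue.get()  # blocks until _process_log enqueues a batch
        try:
            task.execute()       # hypothetical method that runs the batched entries
        except Exception:
            gevent.sleep(1)      # crude backoff; illustrative only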
Example #43
0
def page_over_shards(Model, cursor, limit, get_results=lambda q: q.all()):
    # TODO revisit passing lambda, and cursor format
    cursor = int(cursor)
    start_shard_id = engine_manager.shard_key_for_id(cursor)
    results = []
    remaining_limit = limit
    next_cursor = None
    for shard_id in sorted(engine_manager.engines):
        if shard_id < start_shard_id:
            continue

        if len(results) >= limit:
            break

        with session_scope_by_shard_id(shard_id) as mailsync_session:
            latest_cursor = cursor if shard_id == start_shard_id else None
            query = mailsync_session.query(Model)
            if latest_cursor:
                query = query.filter(Model.id > latest_cursor)
            query = query.order_by(asc(Model.id)).limit(remaining_limit)
            latest_results = get_results(query)

            if latest_results:
                results.extend(latest_results)
                last = latest_results[-1]
                if hasattr(last, 'id'):
                    next_cursor = last.id
                elif 'id' in last:
                    next_cursor = last['id']
                else:
                    raise ValueError('Results returned from get_results must '
                                     'have an id')

                # Handle invalid ids
                cursor_implied_shard = next_cursor >> 48
                if shard_id != 0 and cursor_implied_shard == 0:
                    next_cursor += shard_id << 48

                remaining_limit -= len(latest_results)
    return results, str(next_cursor)
Example #44
0
def test_accounts_started_on_all_shards(db, default_account, config):
    config['SYNC_STEAL_ACCOUNTS'] = True
    purge_other_accounts(default_account)
    default_account.sync_host = None
    db.session.commit()
    ss = SyncService(cpu_id=0, total_cpus=1)
    ss.host = 'localhost'
    account_ids = {default_account.id}
    for key in (0, 1):
        with session_scope_by_shard_id(key) as db_session:
            acc = Account()
            acc.namespace = Namespace()
            db_session.add(acc)
            db_session.commit()
            account_ids.add(acc.id)

    assert len(account_ids) == 3
    assert set(ss.accounts_to_start()) == account_ids
    for id_ in account_ids:
        with session_scope(id_) as db_session:
            acc = db_session.query(Account).get(id_)
            assert acc.sync_host == 'localhost'
Example #45
0
def test_actions_claimed_by_a_single_service(purge_accounts_and_actions,
                                             patched_task):
    actionlogs = []
    for key in (0, 1):
        with session_scope_by_shard_id(key) as db_session:
            account = add_generic_imap_account(
                db_session, email_address='{}@test.com'.format(key))
            schedule_test_action(db_session, account)
            actionlogs += [db_session.query(ActionLog).one().id]

    services = []
    for process_number in (0, 1):
        service = SyncbackService(syncback_id=0,
                                  process_number=process_number,
                                  total_processes=2,
                                  num_workers=2)
        service._process_log()
        services.append(service)

    for i, service in enumerate(services):
        assert service.task_queue.qsize() == 1
        assert service.task_queue.peek().action_log_ids() == [actionlogs[i]]
Example #46
0
def test_actions_claimed_by_a_single_service(purge_accounts_and_actions,
                                             patched_task):
    actionlogs = []
    for key in (0, 1):
        with session_scope_by_shard_id(key) as db_session:
            account = add_generic_imap_account(
                db_session,
                email_address='{}@test.com'.format(key))
            schedule_test_action(db_session, account)
            actionlogs += [db_session.query(ActionLog).one().id]

    services = []
    for process_number in (0, 1):
        service = SyncbackService(
            syncback_id=0, process_number=process_number, total_processes=2,
            num_workers=2)
        service._process_log()
        services.append(service)

    for i, service in enumerate(services):
        assert service.task_queue.qsize() == 1
        assert service.task_queue.peek().action_log_ids() == [actionlogs[i]]
Example #47
0
def test_accounts_started_on_all_shards(db, default_account, config):
    config['SYNC_STEAL_ACCOUNTS'] = True
    purge_other_accounts(default_account)
    default_account.sync_host = None
    db.session.commit()
    ss = SyncService(cpu_id=0, total_cpus=1)
    ss.host = 'localhost'
    account_ids = {default_account.id}
    for key in (0, 1):
        with session_scope_by_shard_id(key) as db_session:
            acc = Account()
            acc.namespace = Namespace()
            db_session.add(acc)
            db_session.commit()
            account_ids.add(acc.id)

    assert len(account_ids) == 3
    assert set(ss.accounts_to_start()) == account_ids
    for id_ in account_ids:
        with session_scope(id_) as db_session:
            acc = db_session.query(Account).get(id_)
            assert acc.sync_host == 'localhost'
Example #48
0
def main():
    """ Generate per-shard and per-namespace breakdowns of syncback queue
    lengths.

    """
    maybe_enable_rollbar()

    for key in engine_manager.engines:
        with session_scope_by_shard_id(key) as db_session:
            total_pending_actions = 0
            for c, namespace_id in (db_session.query(
                    func.count(ActionLog.namespace_id),
                    ActionLog.namespace_id).join(Namespace).join(
                        Account).filter(
                            ActionLog.discriminator == "actionlog",
                            Account.sync_state != "invalid",
                            ActionLog.status == "pending",
                        ).group_by(ActionLog.namespace_id)):
                print(
                    "{} (pending actions), {} (shard), {} (namespace)".format(
                        c, key, namespace_id))
                total_pending_actions += c
            print("total pending actions for shard {}: {}".format(
                key, total_pending_actions))
Example #49
0
def purge_accounts_and_actions():
    for key in engine_manager.engines:
        with session_scope_by_shard_id(key) as db_session:
            db_session.query(ActionLog).delete(synchronize_session=False)
            db_session.query(Account).delete(synchronize_session=False)
            db_session.commit()
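The cleanup helper above appears as a plain argument (purge_accounts_and_actions) in the tests earlier in this listing, which suggests it is exposed as a pytest fixture. A minimal sketch of that wiring; the decorator and the name= indirection are assumptions, not taken from the source.

import pytest

@pytest.fixture(name="purge_accounts_and_actions")
def _purge_accounts_and_actions_fixture():
    # Assumed wiring: run the cleanup defined above before each test that
    # requests the fixture by name.
    purge_accounts_and_actions()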
Example #50
0
def process_shard(shard_id, dry_run, id_start=0):
    # At 500K events, we need to process 6 events per second to finish within a day.
    batch_size = 100
    rps = 6 / batch_size
    window = 5

    throttle = limitlion.throttle_wait("create-event-contact-associations",
                                       rps=rps,
                                       window=window)

    with session_scope_by_shard_id(shard_id) as db_session:
        # NOTE: The session is implicitly autoflushed, which ensures no
        # duplicate contacts are created.

        n = 0
        n_skipped = 0
        n_updated = 0

        while True:
            event_query = list(
                db_session.query(Event).filter(Event.id > id_start).order_by(
                    asc(Event.id)).limit(batch_size))

            if not event_query:
                break

            for event in event_query:
                n += 1
                id_start = event.id

                if n % batch_size == 0:
                    log.info(
                        "progress",
                        shard_id=shard_id,
                        id_start=id_start,
                        n=n,
                        n_skipped=n_skipped,
                        n_updated=n_updated,
                    )

                if event.contacts:
                    continue

                if not dry_run:
                    event.contacts = []
                    update_contacts_from_event(db_session, event,
                                               event.namespace_id)
                    n_updated += 1

                    if n_updated % batch_size == 0:
                        db_session.commit()
                        log.info(
                            "committed",
                            shard_id=shard_id,
                            n=n,
                            n_skipped=n_skipped,
                            n_updated=n_updated,
                        )
                        throttle()

    log.info("finished",
             shard_id=shard_id,
             n=n,
             n_skipped=n_skipped,
             n_updated=n_updated)
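A hypothetical driver for the backfill above, not taken from the source: it walks each shard the same way other snippets in this listing iterate engine_manager.engines, defaulting to a dry run.

def run_backfill(dry_run=True):
    # Illustrative only: process every shard in turn, starting from id 0.
    for shard_id in engine_manager.engines:
        process_shard(shard_id, dry_run=dry_run, id_start=0)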
Example #51
0
def get_accounts_to_delete(shard_id):
    ids_to_delete = []
    with session_scope_by_shard_id(shard_id) as db_session:
        ids_to_delete = [(acc.id, acc.namespace.id) for acc
                         in db_session.query(Account) if acc.is_deleted]
    return ids_to_delete
Example #52
0
def purge_transactions(shard_id,
                       days_ago=60,
                       limit=1000,
                       throttle=False,
                       dry_run=False,
                       now=None):
    start = "now()"
    if now is not None:
        start = "'{}'".format(now.strftime("%Y-%m-%d %H:%M:%S"))

    # Delete all items from the transaction table that are older than
    # `days_ago` days.
    if dry_run:
        offset = 0
        query = ("SELECT id FROM transaction where created_at < "
                 "DATE_SUB({}, INTERVAL {} day) LIMIT {}".format(
                     start, days_ago, limit))
    else:
        query = ("DELETE FROM transaction where created_at < DATE_SUB({},"
                 " INTERVAL {} day) LIMIT {}".format(start, days_ago, limit))
    try:
        # delete rows in batches until no more rows are affected
        rowcount = 1
        while rowcount > 0:
            if throttle:
                bulk_throttle()

            with session_scope_by_shard_id(shard_id,
                                           versioned=False) as db_session:
                if dry_run:
                    rowcount = db_session.execute("{} OFFSET {}".format(
                        query, offset)).rowcount
                    offset += rowcount
                else:
                    rowcount = db_session.execute(query).rowcount
            log.info(
                "Deleted batch from transaction table",
                batch_size=limit,
                rowcount=rowcount,
            )
        log.info(
            "Finished purging transaction table for shard",
            shard_id=shard_id,
            date_delta=days_ago,
        )
    except Exception as e:
        log.critical("Exception encountered during deletion", exception=e)

    # remove old entries from the redis transaction zset
    if dry_run:
        # no dry run for removing things from a redis zset
        return
    try:
        with session_scope_by_shard_id(shard_id,
                                       versioned=False) as db_session:
            (min_txn_id, ) = db_session.query(func.min(Transaction.id)).one()
        redis_txn.zremrangebyscore(
            TXN_REDIS_KEY,
            "-inf",
            "({}".format(min_txn_id) if min_txn_id is not None else "+inf",
        )
        log.info(
            "Finished purging transaction entries from redis",
            min_id=min_txn_id,
            date_delta=days_ago,
        )
    except Exception as e:
        log.critical("Exception encountered during deletion", exception=e)
Example #53
0
def backfix_shard(shard_id, dry_run):
    categories_to_fix = []
    with session_scope_by_shard_id(shard_id) as db_session:
        # 'SELECT id FROM <table> GROUP BY <x>' does not select _all_ of the
        # ids in the group. MySQL chooses one id and returns it. The id chosen
        # is indeterminate. So we find the duplicate
        # (namespace_id, display_name, name) pairs and use them to query
        # for specific Category rows
        category_query = db_session.query(Category.namespace_id,
                                          Category.display_name,
                                          Category.name)

        duplicate_attrs = category_query. \
                group_by(Category.display_name,
                         Category.namespace_id,
                         Category.name).having(
                            func.count(Category.id) > 1).all()

    for namespace_id, display_name, name in duplicate_attrs:
        duplicates = db_session.query(Category.id). \
                filter(Category.namespace_id == namespace_id,
                       Category.display_name == display_name,
                       Category.name == name).all()

        # duplicates is an array of tuples where each tuple is
        # (Category.id,). We flatten the tuples here so that each item in
        # categories_to_fix is a list of category ids that are duplicates
        categories_to_fix.append([item for item in chain(*duplicates)])

    categories_affected = 0
    categories_to_delete = []
    # categories_to_fix is a list of lists where each inner list
    # contains the ids of the duplicate categories
    for grouped_categories in categories_to_fix:
        # Keep track of categories with associated message categories
        categories_with_messages = []

        # It is possible for Messages to be associated with
        # more than one category. We choose the Category with
        # the lowest pk to be the "master" and all other duplicate
        # categories are deleted and their messages consolidated
        # into the master
        grouped_categories.sort()
        master_id = grouped_categories[0]
        categories_affected += len(grouped_categories)

        # Iterate over all of the duplicate categories except master
        for category_id in grouped_categories[1:]:
            with session_scope_by_shard_id(shard_id) as db_session:
                associated_messages = db_session.query(exists().where(
                    MessageCategory.category_id == category_id)).scalar()

                # if category has messages, they need to be de-duped
                # and consolidated
                if associated_messages:
                    log.info('Category has associated messages',
                             category_id=category_id)
                    categories_with_messages.append(category_id)

                # if category does not have messages, it can be deleted
                else:
                    categories_to_delete.append(category_id)
                    log.info('Category does not have associated messages',
                             category_id=category_id)

        if len(categories_with_messages) > 0:
            log.info('Consolidating messages into category',
                     category_id=master_id)

            for category_id in categories_with_messages:
                try:
                    with session_scope_by_shard_id(shard_id) as db_session:
                        messagecategories = db_session.query(MessageCategory).\
                                filter(MessageCategory.category_id == category_id).all()  # noqa

                        for mc in messagecategories:
                            # It's possible for a message to be associated
                            # with both what we've declared to be the master
                            # category and the category we want to delete.
                            # MessageCategory has a unique constraint on
                            # (message_id, category_id), so we first query to
                            # see whether such an object exists. If it does,
                            # we point the MessageCategory to the master
                            # category. If it does not, we simply delete it.
                            mc_exists = db_session.query(exists().where(and_(
                                MessageCategory.category_id == master_id,
                                MessageCategory.message_id == mc.message_id)))\
                                        .scalar()

                            if not dry_run:
                                # If mc_exists == True, then there's a
                                # MessageCategory associated with the master
                                # and the current category, so we can delete
                                # the current category
                                if mc_exists:
                                    db_session.query(MessageCategory).filter_by(id=mc.id).delete()
                                else:
                                    # Master does not have a MessageCategory
                                    # for this message. Update this one to
                                    # point to the master
                                    mc.category_id = master_id
                                db_session.commit()

                            log.info('Updated MessageCategory', mc_id=mc.id,
                                     old_category_id=mc.category_id,
                                     new_category_id=master_id)

                    categories_to_delete.append(category_id)
                except Exception as e:
                    log.critical('Exception encountered while consolidating'
                                 ' messagecategories', e=str(e))
                    raise e

            # We REALLY don't want to delete the category we consolidated all
            # of the messagecategories into
            assert master_id not in categories_to_delete

        for category_id in categories_to_delete:
            if dry_run:
                log.info('Delete category', category_id=category_id)
                continue

            with session_scope_by_shard_id(shard_id) as db_session:
                db_session.query(Category).filter_by(id=category_id).delete()
                log.info('Deleted category', category_id=category_id)

            categories_to_delete.remove(category_id)

    log.info('Completed category migration on shard',
             categories_affected=categories_affected, shard_id=shard_id)
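The chain(*duplicates) line above flattens SQLAlchemy's single-column result tuples into plain ids; a tiny standalone illustration of that step.

from itertools import chain

duplicates = [(101,), (205,), (207,)]  # shape of a query(Category.id).all() result
flat = [item for item in chain(*duplicates)]
assert flat == [101, 205, 207]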
Example #54
0
def backfix_shard(shard_id, dry_run):
    categories_to_fix = []
    with session_scope_by_shard_id(shard_id) as db_session:
        # 'SELECT id FROM <table> GROUP BY <x>' does not select _all_ of the
        # ids in the group. MySQL chooses one id and returns it. The id chosen
        # is indeterminate. So we find the duplicate
        # (namespace_id, display_name, name) pairs and use them to query
        # for specific Category rows
        category_query = db_session.query(Category.namespace_id,
                                          Category.display_name, Category.name)

        duplicate_attrs = (category_query.group_by(
            Category.display_name, Category.namespace_id,
            Category.name).having(func.count(Category.id) > 1).all())

    for namespace_id, display_name, name in duplicate_attrs:
        duplicates = (db_session.query(Category.id).filter(
            Category.namespace_id == namespace_id,
            Category.display_name == display_name,
            Category.name == name,
        ).all())

        # duplicates is an array of tuples where each tuple is
        # (Category.id,). We flatten the tuples here so that each item in
        # categories_to_fix is a list of category ids that are duplicates
        categories_to_fix.append([item for item in chain(*duplicates)])

    categories_affected = 0
    categories_to_delete = []
    # categories_to_fix is a list of lists where each inner list
    # contains the ids of the duplicate categories
    for grouped_categories in categories_to_fix:
        # Keep track of categories with associated message categories
        categories_with_messages = []

        # It is possible for Messages to be associated with
        # more than one category. We choose the Category with
        # the lowest pk to be the "master" and all other duplicate
        # categories are deleted and their messages consolidated
        # into the master
        grouped_categories.sort()
        master_id = grouped_categories[0]
        categories_affected += len(grouped_categories)

        # Iterate over all of the duplicate categories except master
        for category_id in grouped_categories[1:]:
            with session_scope_by_shard_id(shard_id) as db_session:
                associated_messages = db_session.query(exists().where(
                    MessageCategory.category_id == category_id)).scalar()

                # if category has messages, they need to be de-duped
                # and consolidated
                if associated_messages:
                    log.info("Category has associated messages",
                             category_id=category_id)
                    categories_with_messages.append(category_id)

                # if category does not have messages, it can be deleted
                else:
                    categories_to_delete.append(category_id)
                    log.info(
                        "Category does not have associated messages",
                        category_id=category_id,
                    )

        if len(categories_with_messages) > 0:
            log.info("Consolidating messages into category",
                     category_id=master_id)

            for category_id in categories_with_messages:
                try:
                    with session_scope_by_shard_id(shard_id) as db_session:
                        messagecategories = (
                            db_session.query(MessageCategory).filter(
                                MessageCategory.category_id ==
                                category_id).all())  # noqa

                        for mc in messagecategories:
                            # It's possible for a message to be associated
                            # with both what we've declared to be the master
                            # category and the category we want to delete.
                            # MessageCategory has a unique constraint on
                            # (message_id, category_id), so we first query to
                            # see whether such an object exists. If it does,
                            # we point the MessageCategory to the master
                            # category. If it does not, we simply delete it.
                            mc_exists = db_session.query(exists().where(
                                and_(
                                    MessageCategory.category_id == master_id,
                                    MessageCategory.message_id ==
                                    mc.message_id,
                                ))).scalar()

                            if not dry_run:
                                # If mc_exists == True, then there's a
                                # MessageCategory associated with the master
                                # and the current category, so we can delete
                                # the current category
                                if mc_exists:
                                    db_session.query(
                                        MessageCategory).filter_by(
                                            id=mc.id).delete()
                                else:
                                    # Master does not have a MessageCategory
                                    # for this message. Update this one to
                                    # point to the master
                                    mc.category_id = master_id
                                db_session.commit()

                            log.info(
                                "Updated MessageCategory",
                                mc_id=mc.id,
                                old_category_id=mc.category_id,
                                new_category_id=master_id,
                            )

                    categories_to_delete.append(category_id)
                except Exception as e:
                    log.critical(
                        "Exception encountered while consolidating"
                        " messagecategories",
                        e=str(e),
                    )
                    raise e

            # We REALLY don't want to delete the category we consolidated all
            # of the messagecategories into
            assert master_id not in categories_to_delete

        for category_id in categories_to_delete:
            if dry_run:
                log.info("Delete category", category_id=category_id)
                continue

            with session_scope_by_shard_id(shard_id) as db_session:
                db_session.query(Category).filter_by(id=category_id).delete()
                log.info("Deleted category", category_id=category_id)

            categories_to_delete.remove(category_id)

    log.info(
        "Completed category migration on shard",
        categories_affected=categories_affected,
        shard_id=shard_id,
    )
Example #55
0
def delete_marked_accounts(shard_id, throttle=False, dry_run=False):
    start = time.time()
    deleted_count = 0
    ids_to_delete = []

    with session_scope_by_shard_id(shard_id) as db_session:
        ids_to_delete = [(acc.id, acc.namespace.id)
                         for acc in db_session.query(Account)
                         if acc.is_deleted]

    queue_size = len(ids_to_delete)
    for account_id, namespace_id in ids_to_delete:
        # queue_size = length of queue
        # deleted_count = number of accounts deleted during loop iteration
        # this is necessary because the length of ids_to_delete doesn't
        # change during loop iteration
        statsd_client.gauge(
            'mailsync.{}.account_deletion.queue.length'.format(shard_id),
            queue_size - deleted_count)
        try:
            with session_scope(namespace_id) as db_session:
                account = db_session.query(Account).get(account_id)
                if not account:
                    log.critical('Account does not exist',
                                 account_id=account_id)
                    continue

                if account.sync_should_run or not account.is_deleted:
                    log.warn(
                        'Account NOT marked for deletion. '
                        'Will not delete',
                        account_id=account_id)
                    continue

            log.info('Deleting account', account_id=account_id)
            start_time = time.time()
            # Delete data in database
            try:
                log.info('Deleting database data', account_id=account_id)
                delete_namespace(account_id,
                                 namespace_id,
                                 throttle=throttle,
                                 dry_run=dry_run)
            except Exception as e:
                log.critical('Database data deletion failed',
                             error=e,
                             account_id=account_id)
                continue

            # Delete liveness data
            log.debug('Deleting liveness data', account_id=account_id)
            clear_heartbeat_status(account_id)
            deleted_count += 1
            statsd_client.incr('mailsync.account_deletion.queue.deleted', 1)
            statsd_client.timing('mailsync.account_deletion.queue.deleted',
                                 time.time() - start_time)
        except Exception:
            log_uncaught_errors(log, account_id=account_id)

    end = time.time()
    log.info('All data deleted successfully',
             shard_id=shard_id,
             time=end - start,
             count=deleted_count)
Example #56
0
    def _process_log(self):
        before = datetime.utcnow()
        for key in self.keys:
            with session_scope_by_shard_id(key) as db_session:

                # Get the list of namespace ids with pending actions
                namespace_ids = [ns_id[0] for ns_id in db_session.query(ActionLog.namespace_id).filter(
                    ActionLog.discriminator == 'actionlog',
                    ActionLog.status == 'pending').distinct()]

                running_action_ids = {worker.action_log_id for worker in
                                      self.workers}

                # Pick NUM_PARALLEL_ACCOUNTS randomly to make sure we're
                # executing actions equally for each namespace_id --- we
                # don't want a single account with 100k actions hogging
                # the action log.
                namespaces_to_process = []
                if len(namespace_ids) <= NUM_PARALLEL_ACCOUNTS:
                    namespaces_to_process = namespace_ids
                else:
                    namespaces_to_process = random.sample(namespace_ids,
                                                          NUM_PARALLEL_ACCOUNTS)
                self.log.info('Syncback namespace_ids count', shard_id=key,
                              process=self.process_number,
                              num_namespace_ids=len(namespace_ids))

                for ns_id in namespaces_to_process:
                    # The discriminator filter restricts actions to IMAP. EAS
                    # uses a different system.
                    query = db_session.query(ActionLog).filter(
                        ActionLog.discriminator == 'actionlog',
                        ActionLog.status == 'pending',
                        ActionLog.namespace_id == ns_id).order_by(ActionLog.id).\
                        limit(1)

                    log_entry = query.first()

                    if log_entry is None:
                        self.log.error('Got a non-existing action, skipping')
                        continue

                    if log_entry.id in running_action_ids:
                        self.log.info('Skipping already running action',
                                      action_id=log_entry.id)
                        continue

                    namespace = log_entry.namespace
                    if namespace.account.sync_state == 'invalid':
                        self.log.warning('Skipping action for invalid account',
                                         account_id=namespace.account.id,
                                         action_id=log_entry.id,
                                         action=log_entry.action)

                        action_age = (datetime.utcnow() -
                                      log_entry.created_at).total_seconds()

                        if action_age > INVALID_ACCOUNT_GRACE_PERIOD:
                            log_entry.status = 'failed'
                            db_session.commit()
                            self.log.warning('Marking action as failed for '
                                             'invalid account, older than '
                                             'grace period',
                                             account_id=namespace.account.id,
                                             action_id=log_entry.id,
                                             action=log_entry.action)
                            statsd_client.incr('syncback.invalid_failed.total')
                            statsd_client.incr('syncback.invalid_failed.{}'.
                                               format(namespace.account.id))
                        continue

                    self.log.info('delegating action',
                                  action_id=log_entry.id,
                                  msg=log_entry.action)

                    semaphore = self.account_semaphores[namespace.account_id]
                    worker = SyncbackWorker(action_name=log_entry.action,
                                            semaphore=semaphore,
                                            action_log_id=log_entry.id,
                                            record_id=log_entry.record_id,
                                            account_id=namespace.account_id,
                                            provider=namespace.account.
                                            verbose_provider,
                                            service=self,
                                            retry_interval=self.retry_interval,
                                            extra_args=log_entry.extra_args)
                    self.workers.add(worker)
                    self.log.info('Syncback added worker',
                                  process=self.process_number,
                                  worker_count=len(self.workers))
                    worker.start()
        after = datetime.utcnow()
        self.log.info('Syncback completed one iteration',
                      process=self.process_number,
                      duration=(after - before).total_seconds())
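The per-account semaphore handed to each SyncbackWorker suggests concurrency is capped per account rather than globally. A minimal sketch of how such a map could be built, assuming gevent's BoundedSemaphore and a limit of one in-flight action per account; the real limit is not shown in this snippet.

from collections import defaultdict
from gevent.lock import BoundedSemaphore

# Hypothetical: each account gets its own semaphore, created on first use.
account_semaphores = defaultdict(lambda: BoundedSemaphore(1))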
def purge_accounts_and_actions():
    for key in engine_manager.engines:
        with session_scope_by_shard_id(key) as db_session:
            db_session.query(Account).delete(synchronize_session='fetch')
            db_session.query(ActionLog).delete(synchronize_session='fetch')
            db_session.commit()