def trim_webhook_event_task():
    """
    Runs daily and deletes old webhook events, 1,000 ids at a time.

    Completed events created at least settings.SUCCESS_LOGS_TRIM_TIME hours ago are deleted first, then events
    of any status created at least settings.ALL_LOGS_TRIM_TIME hours ago. A falsy setting skips that pass.
    """
    success_hours = settings.SUCCESS_LOGS_TRIM_TIME
    all_hours = settings.ALL_LOGS_TRIM_TIME

    # first pass: successfully completed events only
    if success_hours:
        success_cutoff = timezone.now() - timedelta(hours=success_hours)
        completed = WebHookEvent.objects.filter(
            created_on__lte=success_cutoff, status=WebHookEvent.STATUS_COMPLETE
        )
        for id_batch in chunk_list(completed.values_list('id', flat=True), 1000):
            WebHookEvent.objects.filter(id__in=id_batch).delete()

    # second pass: everything else that is old enough, whatever its status
    if all_hours:
        all_cutoff = timezone.now() - timedelta(hours=all_hours)
        expired = WebHookEvent.objects.filter(created_on__lte=all_cutoff)
        for id_batch in chunk_list(expired.values_list('id', flat=True), 1000):
            WebHookEvent.objects.filter(id__in=id_batch).delete()
def trim_webhook_event_task():
    """
    Runs daily and releases old webhook events, fetching them 1,000 ids at a time.

    Completed events created at least settings.SUCCESS_LOGS_TRIM_TIME hours ago are released first, then events
    of any status created at least settings.ALL_LOGS_TRIM_TIME hours ago. A falsy setting skips that pass.
    """
    success_hours = settings.SUCCESS_LOGS_TRIM_TIME
    all_hours = settings.ALL_LOGS_TRIM_TIME

    # first pass: successfully completed events only
    if success_hours:
        success_cutoff = timezone.now() - timedelta(hours=success_hours)
        completed = WebHookEvent.objects.filter(
            created_on__lte=success_cutoff, status=WebHookEvent.STATUS_COMPLETE
        )
        for id_batch in chunk_list(completed.values_list("id", flat=True), 1000):
            for event in WebHookEvent.objects.filter(id__in=id_batch):
                event.release()

    # second pass: everything else that is old enough, whatever its status
    if all_hours:
        all_cutoff = timezone.now() - timedelta(hours=all_hours)
        expired = WebHookEvent.objects.filter(created_on__lte=all_cutoff)
        for id_batch in chunk_list(expired.values_list("id", flat=True), 1000):
            for event in WebHookEvent.objects.filter(id__in=id_batch):
                event.release()
def migrate_flows(min_version=None):  # pragma: no cover
    """
    Migrates all active flows on a version older than the target version.

    The target is min_version, or the current export version when min_version is falsy. Returns True when
    no flow failed to migrate (including when there was nothing to do), otherwise False.
    """
    to_version = min_version or get_current_export_version()

    # only active flows sitting on a version before the target need work
    outdated = Flow.objects.filter(is_active=True, version_number__in=Flow.get_versions_before(to_version))
    flow_ids = list(outdated.values_list("id", flat=True))
    total = len(flow_ids)

    if total == 0:
        print("All flows up to date")
        return True

    print("Found %d flows to migrate to %s..." % (total, to_version))

    num_updated = 0
    failed = []

    for id_batch in chunk_list(flow_ids, 1000):
        for flow in Flow.objects.filter(id__in=id_batch):
            try:
                flow.ensure_current_version(min_version=to_version)
            except Exception:
                print("Unable to migrate flow '%s' (#%d)" % (flow.name, flow.id))
                failed.append(flow)
            else:
                num_updated += 1

        # progress report after each batch
        print(" > Flows migrated: %d of %d (%d errored)" % (num_updated, total, len(failed)))

    if failed:
        print(" > Errored flows: %s" % (", ".join(str(flow.id) for flow in failed)))

    return not failed
def migrate_flows(min_version=None):  # pragma: no cover
    """
    Migrates all active flows on a version older than the target version.

    The target is min_version, or Flow.FINAL_LEGACY_VERSION when min_version is falsy. Failures are printed
    with their traceback. Returns True when no flow errored, otherwise False.
    """
    to_version = min_version or Flow.FINAL_LEGACY_VERSION

    # only active flows sitting on a version before the target need work
    outdated = Flow.objects.filter(is_active=True, version_number__in=Flow.get_versions_before(to_version))
    flow_ids = list(outdated.values_list("id", flat=True))
    total = len(flow_ids)

    if total == 0:
        print("All flows up to date")
        return True

    print(f"Found {len(flow_ids)} flows to migrate to {to_version}...")

    num_updated = 0
    num_errored = 0

    for id_batch in chunk_list(flow_ids, 1000):
        for flow in Flow.objects.filter(id__in=id_batch):
            try:
                flow.ensure_current_version(min_version=to_version)
            except Exception:
                print(f"Unable to migrate flow '{flow.name}' ({str(flow.uuid)}):")
                print(traceback.format_exc())
                num_errored += 1
            else:
                num_updated += 1

        # progress report after each batch
        print(f" > Flows migrated: {num_updated} of {total} ({num_errored} errored)")

    return num_errored == 0
def populate_recent_runs(FlowPathRecentMessage, FlowPathRecentRun):
    """
    Copies recent message records into recent run records, 1,000 per transaction.

    If recent run records already exist, only messages created before the visited_on of the lowest-id run
    record are converted.
    """
    pending = FlowPathRecentMessage.objects.order_by("id")

    # don't convert any records that have already been added for new runs
    earliest_new = FlowPathRecentRun.objects.order_by("id").first()
    if earliest_new:
        pending = pending.filter(created_on__lt=earliest_new.visited_on)

    total = pending.count()
    if not total:
        return

    converted = 0
    for msg_batch in chunk_list(pending.using("direct").iterator(), 1000):
        # one transaction per batch
        with transaction.atomic():
            for msg in msg_batch:
                FlowPathRecentRun.objects.create(
                    from_uuid=msg.from_uuid,
                    to_uuid=msg.to_uuid,
                    run=msg.run,
                    visited_on=msg.created_on,
                )

        converted += len(msg_batch)
        print(" > Converted %d of %d recent messages to recent runs" % (converted, total))
def handle(self, file_path: str, batch_size: int, tps: int, *args, **options):
    """
    Marks wired messages as errored so that they are retried, one batch at a time.

    Reads one message id per line from file_path (blank lines are ignored), then for each batch of
    batch_size ids updates the messages that are WIRED and have never errored to ERRORED with
    error_count=1. Each batch gets a later next_attempt, spaced by the estimated time to send a batch
    at tps messages per second.
    """
    with open(file_path) as id_file:
        # lines keep their trailing newline, so strip before testing for emptiness - otherwise a blank
        # line is truthy and int() raises ValueError on it
        msg_ids = sorted(int(line) for line in id_file if line.strip())

    self.stdout.write(f"> loaded {len(msg_ids)} msg ids from {file_path}")

    num_batches = math.ceil(len(msg_ids) / batch_size)
    batch_send_time = int(batch_size / tps)  # estimated time to send a batch in seconds
    next_attempt = timezone.now()

    self.stdout.write(f"> estimated batch send time of {batch_send_time} seconds at {tps} TPS")

    for batch_num, id_batch in enumerate(chunk_list(msg_ids, batch_size)):
        # only fetch messages which are WIRED and have never errored
        batch = Msg.objects.filter(id__in=id_batch, status=Msg.STATUS_WIRED, error_count=0)
        num_updated = batch.update(status=Msg.STATUS_ERRORED, error_count=1, next_attempt=next_attempt)

        self.stdout.write(
            f"> batch {batch_num+1}/{num_batches}"
            f" - dewired {num_updated} msg ids, next_attempt={next_attempt.isoformat()}"
        )

        # stagger the next batch so retries arrive at roughly the requested rate
        next_attempt = next_attempt + timedelta(seconds=batch_send_time)
def populate_responded_batch(batch_size, Msg, FlowRun):
    """
    Populates the responded field on up to batch_size runs where it is currently null.

    A run is marked responded=True if any incoming message (direction 'I') is attached to one of its steps,
    otherwise responded=False. Returns a (num_with_responses, num_without_responses) tuple, or (0, 0) when
    there are no runs left to populate.
    """
    # grab ids of a batch of runs with null responded
    pending = FlowRun.objects.filter(responded=None)
    run_ids = list(pending.values_list('pk', flat=True)[:batch_size])

    if not run_ids:
        return 0, 0

    # print() with a single argument behaves the same on Python 2 and 3
    print("Fetched ids of %d runs with no responded value..." % len(run_ids))

    total_with, total_without = 0, 0

    for batch_ids in chunk_list(run_ids, UPDATE_BATCH_SIZE):
        batch_ids = list(batch_ids)

        # which of the runs in this batch have responses?
        msg_responses = Msg.objects.filter(direction='I', steps__run__pk__in=batch_ids)
        with_responses = set(msg_responses.values_list('steps__run', flat=True))
        without_responses = [run_id for run_id in batch_ids if run_id not in with_responses]

        # update our batches of responded/un-responded
        if with_responses:
            FlowRun.objects.filter(pk__in=with_responses).update(responded=True)
        if without_responses:
            FlowRun.objects.filter(pk__in=without_responses).update(responded=False)

        total_with += len(with_responses)
        total_without += len(without_responses)

        print(" > Updated %d of %d runs batch" % (total_with + total_without, len(run_ids)))

    return total_with, total_without
def __init__(self, records: List[Dict], max_payload_size: int = 256):
    """
    Builds self.events from the given records.

    The records are serialized as JSONL (one UTF-8 encoded JSON document per line), split into chunks of
    max_payload_size via chunk_list, and wrapped as "Records" events. A "Stats" event (with fixed scanned
    and processed byte counts and the real payload length) and an "End" event follow.
    """
    # serialize records as a JSONL payload
    payload = b"".join(json.dumps(record).encode("utf-8") + b"\n" for record in records)

    events = [{"Records": {"Payload": chunk}} for chunk in chunk_list(payload, size=max_payload_size)]
    events.append(
        {"Stats": {"Details": {"BytesScanned": 123, "BytesProcessed": 234, "BytesReturned": len(payload)}}}
    )
    events.append({"End": {}})

    self.events = events
def handle(self, start_id: int, event_types: list, dry_run: bool, quiet: bool, *args, **options):
    """
    Undoes events for all runs created by the given flow start.

    Raises CommandError if no start with start_id exists. One undoer is built per entry of UNDO_CLASSES,
    limited to event_types when that is non-empty. Unless quiet is set the user must confirm with "y".
    Runs are processed in batches of self.batch_size and each undoer prints its summary at the end.
    """
    start = FlowStart.objects.filter(id=start_id).first()
    if start is None:
        raise CommandError("no such flow start")

    undoers = {}
    for event_type, clazz in UNDO_CLASSES.items():
        if not event_types or event_type in event_types:
            undoers[event_type] = clazz(self.stdout)

    undo_types = ", ".join(sorted(undoers.keys())) if undoers else "no"
    desc = f"{undo_types} events for start #{start.id} of '{start.flow}' flow in the '{start.org.name}' workspace"

    if quiet:
        self.stdout.write(f"Undoing {desc}...")
    elif input(f"Undo {desc}? [y/N]: ") != "y":
        return

    self.stdout.write("Fetching run ids... ", ending="")
    run_ids = list(start.runs.values_list("id", flat=True))
    self.stdout.write(f"found {len(run_ids)}")

    num_fixed = 0

    # process runs in batches
    for run_id_batch in chunk_list(run_ids, self.batch_size):
        runs = FlowRun.objects.filter(id__in=run_id_batch).only("id", "contact_id", "session_id")
        run_batch = list(runs)

        self.undo_for_batch(run_batch, undoers, dry_run)
        num_fixed += len(run_batch)

        self.stdout.write(f" > Fixed {num_fixed} contacts")

    # print summaries of the undoers
    for undoer in undoers.values():
        undoer.print_summary()
def bulk_exit(runs, exit_type, exited_on=None):
    """
    Exits the given runs in bulk.

    runs is either a list of run objects or a queryset of runs. For each flow involved, activity is
    removed for its runs; then the runs are marked inactive with the given exit_type, 1,000 at a time,
    and continue_parent_flows is queued for each batch. exited_on defaults to the current time.
    """
    from temba.flows.models import Flow, FlowRun

    if isinstance(runs, list):
        runs = [{'id': r.pk, 'flow_id': r.flow_id} for r in runs]
    else:
        runs = list(runs.values('id', 'flow_id'))  # select only what we need...

    # organize runs by flow
    runs_by_flow = defaultdict(list)
    for run in runs:
        runs_by_flow[run['flow_id']].append(run['id'])

    # for each flow, remove activity for all runs
    # (items() rather than iteritems(), which no longer exists on Python 3)
    for flow_id, run_ids in runs_by_flow.items():
        flow = Flow.objects.filter(id=flow_id).first()
        if flow:
            flow.remove_active_for_run_ids(run_ids)

    modified_on = timezone.now()
    if not exited_on:
        exited_on = modified_on

    from temba.flows.tasks import continue_parent_flows

    # batch this for 1,000 runs at a time so we don't grab locks for too long
    for batch in chunk_list(runs, 1000):
        ids = [r['id'] for r in batch]
        run_objs = FlowRun.objects.filter(pk__in=ids)
        run_objs.update(is_active=False, exited_on=exited_on, exit_type=exit_type, modified_on=modified_on)

        # continue the parent flows to continue async
        continue_parent_flows.delay(ids)
def migrate_flow_activity(Flow, FlowPathCount, FlowPathRecentMessage):
    """
    Rewrites path count and recent message records that start at an action set so that their from_uuid
    is the action set's exit_uuid instead of its uuid.

    Raises ValueError if some flows could not first be migrated to version 10.4.
    """
    # start by ensuring all flows are at a minimum version (the one that added exit_uuid to actionsets)
    if not migrate_flows('10.4'):
        raise ValueError("Migration can't proceed because some flows couldn't be migrated")

    flow_ids = list(Flow.objects.filter(is_active=True).values_list('id', flat=True))
    if not flow_ids:
        return

    total = len(flow_ids)
    print("Found %d active flows to migrate activity for..." % total)

    num_updated = 0
    for id_batch in chunk_list(flow_ids, 1000):
        for flow in Flow.objects.filter(id__in=id_batch).prefetch_related('action_sets'):
            # all of one flow's records are rewritten in a single transaction
            with transaction.atomic():
                for action_set in flow.action_sets.all():
                    old_uuid, new_uuid = action_set.uuid, action_set.exit_uuid
                    FlowPathCount.objects.filter(flow=flow, from_uuid=old_uuid).update(from_uuid=new_uuid)
                    FlowPathRecentMessage.objects.filter(from_uuid=old_uuid).update(from_uuid=new_uuid)

            num_updated += 1

        print(" > Updated %d of %d flows" % (num_updated, total))
def apply_as_migration(apps, schema_editor):
    """
    Interrupts every active run (with no exit type) that belongs to an archived flow.

    For each batch of 1,000 run ids, steps that haven't been left are marked as left, and the runs are
    marked inactive with exit type "I".
    """
    Flow = apps.get_model("flows", "Flow")
    FlowRun = apps.get_model("flows", "FlowRun")
    FlowStep = apps.get_model("flows", "FlowStep")

    for flow in Flow.objects.filter(is_archived=True):
        active_runs = FlowRun.objects.filter(is_active=True, exit_type=None, flow_id=flow.id)
        run_ids = list(active_runs.values_list("id", flat=True))

        # batch this for 1,000 runs at a time so we don't grab locks for too long
        for id_batch in chunk_list(run_ids, 1000):
            now = timezone.now()

            # mark all steps in these runs as having been left
            FlowStep.objects.filter(run__id__in=id_batch, left_on=None).update(left_on=now)

            FlowRun.objects.filter(id__in=id_batch).update(
                is_active=False, exited_on=now, exit_type="I", modified_on=now
            )
def migrate_flows():  # pragma: no cover
    """
    Migrates every active flow which is on neither Flow.FINAL_LEGACY_VERSION nor Flow.CURRENT_SPEC_VERSION.

    Failures are printed with their traceback. Returns True when no flow errored, otherwise False.
    """
    candidates = Flow.objects.filter(is_active=True)
    candidates = candidates.exclude(version_number=Flow.FINAL_LEGACY_VERSION)
    candidates = candidates.exclude(version_number=Flow.CURRENT_SPEC_VERSION)

    flow_ids = list(candidates.values_list("id", flat=True))
    total = len(flow_ids)

    if total == 0:
        print("All flows up to date")
        return True

    print(f"Found {len(flow_ids)} flows to migrate...")

    num_updated = 0
    num_errored = 0

    for id_batch in chunk_list(flow_ids, 5000):
        for flow in Flow.objects.filter(id__in=id_batch):
            try:
                flow.ensure_current_version()
            except Exception:
                print(
                    f"Unable to migrate flow[uuid={str(flow.uuid)} name={flow.name} created_on={flow.created_on.isoformat()}]':"
                )
                print(traceback.format_exc())
                num_errored += 1
            else:
                num_updated += 1

        # progress report after each batch
        print(f" > Flows migrated: {num_updated} of {total} ({num_errored} errored)")

    return num_errored == 0
def fix_flow_types(apps, schema_editor):
    """
    Changes the type of every flow with type "F" to "M", 1,000 flows per update.
    """
    Flow = apps.get_model("flows", "Flow")

    legacy_flows = Flow.objects.filter(flow_type="F")
    num_updated = 0

    for batch in chunk_list(legacy_flows, 1000):
        batch_ids = [flow.id for flow in batch]
        Flow.objects.filter(id__in=batch_ids).update(flow_type="M")

        num_updated += len(batch)
        print(f" > Updated {num_updated} flows with type F to type M")
def trim(cls):
    """
    Deletes every HTTP log created three or more days ago, in batches of 1,000 ids
    """
    three_days_ago = timezone.now() - timedelta(days=3)
    expired = HTTPLog.objects.filter(created_on__lte=three_days_ago)

    for id_batch in chunk_list(expired.values_list("id", flat=True), 1000):
        HTTPLog.objects.filter(id__in=id_batch).delete()
def refresh_whatsapp_contacts(channel_id):
    """
    Asks the WhatsApp server behind the given channel to look up all of the org's WhatsApp URNs.

    Does nothing if a refresh already holds the redis key for this channel, or if the channel doesn't
    exist or is inactive. URNs are posted 1,000 at a time, every response is recorded as an HTTP log,
    and the first non-200 response stops the refresh.
    """
    r = get_redis_connection()
    key = "refresh_whatsapp_contacts_%d" % channel_id

    # we can't use our non-overlapping task decorator as it creates a loop in the celery resolver when registering
    if r.get(key):  # pragma: no cover
        return

    channel = Channel.objects.filter(id=channel_id, is_active=True).first()
    if not channel:  # pragma: no cover
        return

    with r.lock(key, 3600):
        # look up all whatsapp URNs for this channel
        wa_urns = ContactURN.objects.filter(
            org_id=channel.org_id,
            scheme=WHATSAPP_SCHEME,
            contact__is_stopped=False,
            contact__is_blocked=False,
        )
        wa_urns = wa_urns.exclude(contact=None).only("id", "path")

        # 1,000 contacts at a time, we ask WhatsApp to look up our contacts based on the path
        refreshed = 0

        for urn_batch in chunk_list(wa_urns, 1000):
            # need to wait 10 seconds between each batch of 1000
            if refreshed > 0:  # pragma: no cover
                time.sleep(10)

            # build a list of the fully qualified numbers we have
            payload = {"blocking": "wait", "contacts": ["+%s" % urn.path for urn in urn_batch]}

            # go fetch our contacts
            headers = {"Authorization": "Bearer %s" % channel.config[Channel.CONFIG_AUTH_TOKEN]}
            url = channel.config[Channel.CONFIG_BASE_URL] + "/v1/contacts"

            start = timezone.now()
            resp = requests.post(url, json=payload, headers=headers)
            elapsed = (timezone.now() - start).total_seconds() * 1000

            HTTPLog.create_from_response(
                HTTPLog.WHATSAPP_CONTACTS_REFRESHED, url, resp, channel=channel, request_time=elapsed
            )

            # if we had an error, break out
            if resp.status_code != 200:
                break

            refreshed += len(urn_batch)

        print("refreshed %d whatsapp urns for channel %d" % (refreshed, channel_id))
def trim_channel_log_task():
    """
    Deletes channel logs older than the "channellog" retention period, in batches of 1,000 ids
    """
    cutoff = timezone.now() - settings.RETENTION_PERIODS["channellog"]
    expired = ChannelLog.objects.filter(created_on__lte=cutoff)

    for id_batch in chunk_list(expired.values_list("id", flat=True), 1000):
        ChannelLog.objects.filter(id__in=id_batch).delete()
def populate_flowsteps_for_broadcast(RelatedBroadcast, RelatedMsg, MsgManager, broadcast, batch):
    """
    Appends to batch (in place) one unsaved RelatedBroadcast for every distinct flow step found, per chunk of
    1,000 message ids, among the messages of the given broadcast. Returns how many objects were appended.
    """
    size_before = len(batch)
    msg_ids = MsgManager.filter(broadcast=broadcast.id).values_list('id', flat=True)

    for id_chunk in chunk_list(set(msg_ids), 1000):
        step_ids = set(RelatedMsg.objects.filter(msg_id__in=id_chunk).values_list('flowstep_id', flat=True))

        # the caller relies on the list it passed in being extended
        batch.extend([RelatedBroadcast(flowstep_id=step_id, broadcast_id=broadcast.id) for step_id in step_ids])

    return len(batch) - size_before
def clear_next_attempt(apps, schema_editor):  # pragma: no cover
    """
    Clears next_attempt on all messages belonging to channels of type "A", 100 channels per update.
    """
    Msg = apps.get_model("msgs", "Msg")
    Channel = apps.get_model("channels", "Channel")

    android_channels = Channel.objects.filter(channel_type="A")

    for channel_ids in chunk_list(android_channels.values_list("id", flat=True), 100):
        Msg.objects.filter(channel_id__in=channel_ids).update(next_attempt=None)
def trim_event_fires_task():
    """
    Deletes up to 100,000 event fires that fired more than settings.EVENT_FIRE_TRIM_DAYS days ago,
    oldest first, in batches of 100, then prints how many were selected and how long it took.
    """
    start = timezone.now()
    boundary = timezone.now() - timedelta(days=settings.EVENT_FIRE_TRIM_DAYS)

    expired = EventFire.objects.filter(fired__lt=boundary).values_list("id", flat=True)
    trim_ids = expired.order_by("fired")[:100000]

    for id_batch in chunk_list(trim_ids, 100):
        # use a bulk delete for performance reasons, nothing references EventFire
        EventFire.objects.filter(id__in=id_batch).delete()

    print(f"Deleted {len(trim_ids)} event fires in {timezone.now()-start}")
def do_populate_send_all(Broadcast):
    """
    Sets send_all=False on every broadcast, 5,000 broadcasts per update, printing cumulative progress.
    """
    broadcast_ids = Broadcast.objects.all().values_list('id', flat=True)

    broadcast_count = len(broadcast_ids)
    if broadcast_count:
        print('Starting to update %d broadcasts send all field...' % broadcast_count)

    updated = 0
    for chunk in chunk_list(broadcast_ids, 5000):
        Broadcast.objects.filter(pk__in=chunk).update(send_all=False)

        # keep a running total - previously this was never incremented, so the progress line only ever
        # showed the size of the current chunk
        updated += len(chunk)
        print("Updated %d of %d broadcasts" % (updated, broadcast_count))
def populate_is_system(apps, schema_editor):
    """
    Populates is_system on flows where it is null: False for flows whose type isn't "M", True for flows
    whose type is "M". Flows are updated 1,000 at a time with a progress line after each batch.
    """
    Flow = apps.get_model("flows", "Flow")

    unset = Flow.objects.filter(is_system=None)
    total = unset.count()
    if total:
        print(f"Updating is_system on {total} flows...")

    num_updated = 0

    # non-"M" flows first, then the "M" flows
    passes = (
        (Flow.objects.filter(is_system=None).exclude(flow_type="M"), False),
        (Flow.objects.filter(is_system=None).filter(flow_type="M"), True),
    )
    for flows, is_system in passes:
        for batch in chunk_list(flows, 1000):
            Flow.objects.filter(id__in=[f.id for f in batch]).update(is_system=is_system)
            num_updated += len(batch)
            print(f" > Updated {num_updated} of {total} flows")
def release_contacts(user_id, contact_ids):
    """
    Releases each of the given contacts that is still active, on behalf of the given user.
    Contacts are loaded 100 at a time with their URNs prefetched.
    """
    user = User.objects.get(pk=user_id)

    for id_batch in chunk_list(contact_ids, 100):
        active_contacts = Contact.objects.filter(id__in=id_batch, is_active=True)
        for contact in active_contacts.prefetch_related("urns"):
            contact.release(user)
def backfill_flowsteps(FlowStep, Broadcast, MsgManager):
    """
    Rebuilds the flow step <-> broadcast relations for every broadcast above the stored highpoint.

    For each broadcast its existing relations are deleted and new ones are accumulated in memory; once
    more than 1,000 have accumulated they are bulk created and the broadcast's id is stored in redis
    under HIGHPOINT_KEY so an interrupted run can resume. The key is deleted on completion.
    """
    # we keep track of our completed broadcasts so we can pick up where we left off if interrupted
    r = get_redis_connection()
    highpoint = r.get(HIGHPOINT_KEY)
    if highpoint is None:
        highpoint = 0

    RelatedBroadcast = FlowStep.broadcasts.through
    RelatedMsg = FlowStep.messages.through

    broadcast_ids = Broadcast.objects.filter(id__gt=highpoint).order_by('id').values_list('id', flat=True)

    start = time.time()
    batch = []
    i = 0

    for broadcast_id_batch in chunk_list(broadcast_ids, 1000):
        broadcasts = Broadcast.objects.filter(id__in=broadcast_id_batch).order_by('id').only('id')

        for broadcast in broadcasts:
            i += 1

            # clear any current relations on this broadcast
            RelatedBroadcast.objects.filter(broadcast_id=broadcast.id).delete()

            # appends the new relations for this broadcast to batch
            populate_flowsteps_for_broadcast(RelatedBroadcast, RelatedMsg, MsgManager, broadcast, batch)

            if len(batch) > 1000:
                for broadcast_batch in chunk_list(batch, 1000):
                    RelatedBroadcast.objects.bulk_create(broadcast_batch)

                r.set(HIGHPOINT_KEY, broadcast.id)
                batch = []

        # print() with a single argument behaves the same on Python 2 and 3
        print("Processed %d / %d (batch size %d) in %d" % (
            i, len(broadcast_ids), len(batch), int(time.time() - start)))

    # flush whatever is left over
    for broadcast_batch in chunk_list(batch, 1000):
        RelatedBroadcast.objects.bulk_create(broadcast_batch)

    # we finished, no need to track any more status
    r.delete(HIGHPOINT_KEY)
def backfill_urn_identity(apps, schema_editor):
    """
    Copies urn into identity for every contact URN whose identity is null, 1,000 URNs per update.
    """
    ContactURN = apps.get_model('contacts', 'ContactURN')

    urn_ids = ContactURN.objects.filter(identity=None).values_list('id', flat=True)
    print("found %d urns to backfill" % len(urn_ids))

    backfilled = 0
    for id_batch in chunk_list(urn_ids, 1000):
        ContactURN.objects.filter(id__in=id_batch).update(identity=F('urn'))

        backfilled += len(id_batch)
        print("backfilled %d of %d URNs" % (backfilled, len(urn_ids)))
def trim_webhook_event_task():
    """
    Deletes webhook events older than the "webhookevent" retention period, in batches of 1,000 ids.
    Does nothing when that retention period is falsy.
    """
    retention = settings.RETENTION_PERIODS["webhookevent"]
    if not retention:
        return

    cutoff = timezone.now() - settings.RETENTION_PERIODS["webhookevent"]
    expired = WebHookEvent.objects.filter(created_on__lte=cutoff)

    for id_batch in chunk_list(expired.values_list("id", flat=True), 1000):
        WebHookEvent.objects.filter(id__in=id_batch).delete()
def delete_inactive_channelevents(apps, schema_editor):
    """
    Deletes all inactive channel events, 1,000 per delete, printing a running total.
    """
    ChannelEvent = apps.get_model('channels', 'ChannelEvent')

    # delete all channel events that are inactive, we don't care to keep those around
    inactive_ids = ChannelEvent.objects.filter(is_active=False).values_list('id', flat=True)
    print("Found %d channel events to delete" % len(inactive_ids))

    deleted = 0
    for id_batch in chunk_list(inactive_ids, 1000):
        ChannelEvent.objects.filter(id__in=id_batch).delete()

        deleted += len(id_batch)
        print("Deleted %d" % deleted)
def refresh_whatsapp_contacts(channel_id):
    """
    Asks the WhatsApp server behind the given channel to look up all of the org's WhatsApp URNs.

    Does nothing if a refresh already holds the redis key for this channel, or if the channel doesn't
    exist or is inactive. URNs are posted 1,000 at a time.

    Raises Exception if a response isn't a 200 or its JSON body has a truthy (or missing) 'error' value.
    """
    r = get_redis_connection()
    key = 'refresh_whatsapp_contacts_%d' % channel_id

    # we can't use our non-overlapping task decorator as it creates a loop in the celery resolver when registering
    if r.get(key):  # pragma: no cover
        return

    channel = Channel.objects.filter(id=channel_id, is_active=True).first()
    if not channel:  # pragma: no cover
        return

    with r.lock(key, 3600):
        # look up all whatsapp URNs for this channel
        wa_urns = (
            ContactURN.objects.filter(
                org_id=channel.org_id, scheme=WHATSAPP_SCHEME, contact__is_stopped=False, contact__is_blocked=False
            )
            .exclude(contact=None)
            .only('id', 'path')
        )

        # 1,000 contacts at a time, we ask WhatsApp to look up our contacts based on the path
        refreshed = 0

        for urn_batch in chunk_list(wa_urns, 1000):
            # need to wait 10 seconds between each batch of 1000
            if refreshed > 0:  # pragma: no cover
                time.sleep(10)

            # build a list of the fully qualified numbers we have
            contacts = ["+%s" % u.path for u in urn_batch]
            payload = {"blocking": "wait", "contacts": contacts}

            # go fetch our contacts
            headers = {"Authorization": "Bearer %s" % channel.config[Channel.CONFIG_AUTH_TOKEN]}
            resp = requests.post(
                channel.config[Channel.CONFIG_BASE_URL] + '/v1/contacts', json=payload, headers=headers
            )

            # if we had an error, break out
            if resp.status_code != 200 or resp.json().get('error', True):
                # interpolate the channel id - passing it as a second argument to Exception leaves the
                # message as an unformatted (template, id) tuple
                raise Exception("Received error refreshing contacts for %d" % channel.id)

            refreshed += len(urn_batch)

        print("refreshed %d whatsapp urns for channel %d" % (refreshed, channel_id))
def clear_old_msg_external_ids():
    """
    Nulls out external_id on messages created more than 30 days ago, 1,000 messages per update.

    This keeps the index on that column small. External ids aren't surfaced anywhere and are only used for
    debugging channel issues, so they are of limited use on older messages.
    """
    threshold = timezone.now() - timedelta(days=30)  # 30 days ago

    old_msgs = Msg.objects.filter(created_on__lt=threshold).exclude(external_id=None)
    msg_ids = list(old_msgs.values_list('id', flat=True))

    for id_batch in chunk_list(msg_ids, 1000):
        Msg.objects.filter(id__in=id_batch).update(external_id=None)

    print("Cleared external ids on %d messages" % len(msg_ids))
def migrate_duration_extra(apps, schema_editor):
    """
    For every channel event with a duration >= 0, sets extra to the text {"duration":<duration>}.
    Events are updated 250 at a time with a running total printed after each batch.
    """
    ChannelEvent = apps.get_model('channels', 'ChannelEvent')

    # find all events with a duration and convert them to extra
    event_ids = ChannelEvent.objects.filter(duration__gte=0).values_list('id', flat=True)
    if event_ids:
        print("Found %d channel events to set extra on" % len(event_ids))

    updated = 0
    for id_batch in chunk_list(event_ids, 250):
        # the value is built in the database by concatenating around the duration column
        extra_json = Concat(Value('{"duration":'), F('duration'), Value('}'), output_field=TextField())
        ChannelEvent.objects.filter(id__in=id_batch).update(extra=extra_json)

        updated += len(id_batch)
        print("Updated %d" % updated)
def clear_old_msg_external_ids():
    """
    Nulls out external_id on messages created more than 30 days ago, 1,000 messages per update.

    This keeps the index on that column small. External ids aren't surfaced anywhere and are only used for
    debugging channel issues, so they are of limited use on older messages.
    """
    threshold = timezone.now() - timedelta(days=30)  # 30 days ago

    old_msgs = Msg.objects.filter(created_on__lt=threshold).exclude(external_id=None)
    msg_ids = list(old_msgs.values_list('id', flat=True))

    for id_batch in chunk_list(msg_ids, 1000):
        Msg.objects.filter(pk__in=id_batch).update(external_id=None)

    print("Cleared external ids on %d messages" % len(msg_ids))
def derive_opt_outs(apps, schema_editor):
    """
    Converts system groups of type 'F' to type 'S', then unstops every active, stopped contact that
    belongs to a group of type 'U', loading contacts 100 at a time.
    """
    from temba.contacts.models import Contact, ContactGroup

    # remap our group types to reflect failed becoming stopped
    ContactGroup.system_groups.filter(group_type='F').update(group_type='S')

    # now unstop any contacts that belong to groups, these are temporary failures
    failed_ids = Contact.objects.filter(
        is_active=True, is_stopped=True, all_groups__group_type='U'
    ).distinct().values_list('id', flat=True)

    for chunk_ids in chunk_list(failed_ids, 100):
        for contact in Contact.objects.filter(id__in=chunk_ids):
            contact.unstop(contact.modified_by)

            # print() with a single argument behaves the same on Python 2 and 3
            print("unstopped: %d" % contact.id)
def update_session_wait_expires(flow_id):
    """
    Recalculates wait_expires_on for every session waiting in the given flow, as its wait_started_on
    plus the flow's expires_after_minutes. Sessions are updated 1,000 at a time.
    """
    flow = Flow.objects.get(id=flow_id)
    waiting = flow.sessions.filter(status=FlowSession.STATUS_WAITING)

    for id_batch in chunk_list(waiting.values_list("id", flat=True), 1000):
        new_expiry = F("wait_started_on") + timedelta(minutes=flow.expires_after_minutes)
        FlowSession.objects.filter(id__in=id_batch).update(wait_expires_on=new_expiry)
def populate_exit_type_batch(batch_size, FlowRun, FlowStep, ActionSet):
    """
    Populates exit_type on up to batch_size inactive runs that have none.

    A run whose last step (by arrived_on) is terminal according to step_is_terminal becomes completed
    ('C'); otherwise a run which already has exited_on becomes expired ('E'); anything else becomes
    interrupted ('I'). Completed and interrupted runs also get exited_on set from modified_on.

    Returns the number of run ids fetched, or 0 when there are none left.
    """
    # grab ids of a batch of inactive runs with no exit type
    exited_runs = FlowRun.objects.filter(is_active=False, exit_type=None)
    exited_run_ids = list(exited_runs.values_list('pk', flat=True)[:batch_size])

    if not exited_run_ids:
        return 0

    # print() with a single argument behaves the same on Python 2 and 3
    print("Fetched ids of %d potentially expired, completed or stopped runs" % len(exited_run_ids))

    # grab UUIDs of all terminal action sets for quick lookups
    terminal_nodes = set([n['uuid'] for n in ActionSet.objects.filter(destination=None).values('uuid')])
    if terminal_nodes:
        print("Cached %d terminal nodes for run completion calculation" % len(terminal_nodes))

    # pre-fetch required for completion calculation
    steps_prefetch = Prefetch('steps', queryset=FlowStep.objects.order_by('arrived_on'))

    num_updated = 0

    for batch_ids in chunk_list(exited_run_ids, UPDATE_BATCH_SIZE):
        completed_ids = []
        interrupted_ids = []
        expired_ids = []

        for run in FlowRun.objects.filter(pk__in=batch_ids).prefetch_related(steps_prefetch):
            # get last step in this run
            steps = list(run.steps.all())
            last_step = steps[-1] if steps else None

            if last_step and step_is_terminal(last_step, terminal_nodes):
                completed_ids.append(run.pk)
            elif run.exited_on:
                expired_ids.append(run.pk)
            else:
                interrupted_ids.append(run.pk)

        # update our batches of completed/interrupted/expired, using modified_on as approximate exited_on
        if completed_ids:
            FlowRun.objects.filter(pk__in=completed_ids).update(exited_on=F('modified_on'), exit_type='C')
        if interrupted_ids:
            FlowRun.objects.filter(pk__in=interrupted_ids).update(exited_on=F('modified_on'), exit_type='I')
        if expired_ids:
            FlowRun.objects.filter(pk__in=expired_ids).update(exit_type='E')

        num_updated += len(completed_ids) + len(interrupted_ids) + len(expired_ids)

        print(" > Updated %d of %d runs" % (num_updated, len(exited_run_ids)))

    return len(exited_run_ids)
def backfill_flowsteps(FlowStep, Broadcast, MsgManager):
    """
    Rebuilds the flow step <-> broadcast relations for every broadcast above the stored highpoint.

    For each broadcast its existing relations are deleted and new ones are accumulated in memory; once
    more than 1,000 have accumulated they are bulk created and the broadcast's id is stored in redis
    under HIGHPOINT_KEY so an interrupted run can resume. The key is deleted on completion.
    """
    # we keep track of our completed broadcasts so we can pick up where we left off if interrupted
    r = get_redis_connection()
    highpoint = r.get(HIGHPOINT_KEY)
    if highpoint is None:
        highpoint = 0

    RelatedBroadcast = FlowStep.broadcasts.through
    RelatedMsg = FlowStep.messages.through

    broadcast_ids = Broadcast.objects.filter(id__gt=highpoint).order_by('id').values_list('id', flat=True)

    start = time.time()
    batch = []
    i = 0

    for broadcast_id_batch in chunk_list(broadcast_ids, 1000):
        broadcasts = Broadcast.objects.filter(id__in=broadcast_id_batch).order_by('id').only('id')

        for broadcast in broadcasts:
            i += 1

            # clear any current relations on this broadcast
            RelatedBroadcast.objects.filter(broadcast_id=broadcast.id).delete()

            # appends the new relations for this broadcast to batch
            populate_flowsteps_for_broadcast(RelatedBroadcast, RelatedMsg, MsgManager, broadcast, batch)

            if len(batch) > 1000:
                for broadcast_batch in chunk_list(batch, 1000):
                    RelatedBroadcast.objects.bulk_create(broadcast_batch)

                r.set(HIGHPOINT_KEY, broadcast.id)
                batch = []

        # print() with a single argument behaves the same on Python 2 and 3
        print("Processed %d / %d (batch size %d) in %d" % (i, len(broadcast_ids), len(batch), int(time.time() - start)))

    # flush whatever is left over
    for broadcast_batch in chunk_list(batch, 1000):
        RelatedBroadcast.objects.bulk_create(broadcast_batch)

    # we finished, no need to track any more status
    r.delete(HIGHPOINT_KEY)
def populate_is_system(apps, schema_editor):
    """
    Populates is_system on flows where it is null: False for flows whose type isn't "M", True for flows
    whose type is "M". Flows are updated 1,000 at a time with a progress line after each batch.
    """
    Flow = apps.get_model("flows", "Flow")

    total = Flow.objects.filter(is_system=None).count()
    if total:
        print(f"Updating is_system on {total} flows...")

    num_updated = 0

    # pass 1: everything that isn't of type "M" is not a system flow
    non_system = Flow.objects.filter(is_system=None).exclude(flow_type="M")
    for batch in chunk_list(non_system, 1000):
        batch_ids = [f.id for f in batch]
        Flow.objects.filter(id__in=batch_ids).update(is_system=False)
        num_updated += len(batch)
        print(f" > Updated {num_updated} of {total} flows")

    # pass 2: flows of type "M" are system flows
    system = Flow.objects.filter(is_system=None).filter(flow_type="M")
    for batch in chunk_list(system, 1000):
        batch_ids = [f.id for f in batch]
        Flow.objects.filter(id__in=batch_ids).update(is_system=True)
        num_updated += len(batch)
        print(f" > Updated {num_updated} of {total} flows")
def trim_flow_starts():
    """
    Deletes completed or failed flow starts that have no creating user and were last modified before
    the "flowstart" retention period.

    Starts are processed 1,000 at a time: runs pointing at them are detached first (start_id set to
    null), then their contact and group relations, counts and finally the starts themselves are deleted.
    """
    trim_before = timezone.now() - settings.RETENTION_PERIODS["flowstart"]
    num_deleted = 0
    start = timezone.now()

    logger.info(f"Deleting completed non-user created flow starts created before {trim_before.isoformat()}")

    def select_run_ids(for_start_ids):
        # up to 100,000 ids of runs that still reference the given starts
        return FlowRun.objects.filter(start_id__in=for_start_ids).values_list("id", flat=True)[:100000]

    while True:
        trimmable = FlowStart.objects.filter(
            created_by=None,
            status__in=(FlowStart.STATUS_COMPLETE, FlowStart.STATUS_FAILED),
            modified_on__lte=trim_before,
        )
        start_ids = list(trimmable.values_list("id", flat=True)[:1000])
        if not start_ids:
            break

        # detach any flows runs that belong to these starts
        run_ids = select_run_ids(start_ids)
        while len(run_ids) > 0:
            for id_chunk in chunk_list(run_ids, 1000):
                FlowRun.objects.filter(id__in=id_chunk).update(start_id=None)

            # reselect for our next batch
            run_ids = select_run_ids(start_ids)

        # delete dependent rows before the starts themselves
        FlowStart.contacts.through.objects.filter(flowstart_id__in=start_ids).delete()
        FlowStart.groups.through.objects.filter(flowstart_id__in=start_ids).delete()
        FlowStartCount.objects.filter(start_id__in=start_ids).delete()
        FlowStart.objects.filter(id__in=start_ids).delete()
        num_deleted += len(start_ids)

        if num_deleted % 10000 == 0:  # pragma: no cover
            logger.debug(f" > Deleted {num_deleted} flow starts")

    logger.info(f"Deleted {num_deleted} completed non-user created flow starts in {timesince(start)}")
def derive_opt_outs(apps, schema_editor):
    """
    Converts system groups of type 'F' to type 'S', then unstops every active, stopped contact that
    belongs to a group of type 'U', loading contacts 100 at a time.
    """
    from temba.contacts.models import Contact, ContactGroup

    # remap our group types to reflect failed becoming stopped
    ContactGroup.system_groups.filter(group_type='F').update(group_type='S')

    # now unstop any contacts that belong to groups, these are temporary failures
    failed_ids = Contact.objects.filter(
        is_active=True, is_stopped=True, all_groups__group_type='U'
    ).distinct().values_list('id', flat=True)

    for chunk_ids in chunk_list(failed_ids, 100):
        for contact in Contact.objects.filter(id__in=chunk_ids):
            contact.unstop(contact.modified_by)

            # print() with a single argument behaves the same on Python 2 and 3
            print("unstopped: %d" % contact.id)
def refresh_whatsapp_contacts(channel_id):
    """
    Asks the WhatsApp server behind the given channel to look up all of the org's WhatsApp URNs.

    Does nothing if a refresh already holds the redis key for this channel, or if the channel doesn't
    exist or is inactive. URNs are posted 1,000 at a time.

    Raises Exception if a response has a status code other than 200.
    """
    r = get_redis_connection()
    key = "refresh_whatsapp_contacts_%d" % channel_id

    # we can't use our non-overlapping task decorator as it creates a loop in the celery resolver when registering
    if r.get(key):  # pragma: no cover
        return

    channel = Channel.objects.filter(id=channel_id, is_active=True).first()
    if not channel:  # pragma: no cover
        return

    with r.lock(key, 3600):
        # look up all whatsapp URNs for this channel
        wa_urns = (
            ContactURN.objects.filter(
                org_id=channel.org_id, scheme=WHATSAPP_SCHEME, contact__is_stopped=False, contact__is_blocked=False
            )
            .exclude(contact=None)
            .only("id", "path")
        )

        # 1,000 contacts at a time, we ask WhatsApp to look up our contacts based on the path
        refreshed = 0

        for urn_batch in chunk_list(wa_urns, 1000):
            # need to wait 10 seconds between each batch of 1000
            if refreshed > 0:  # pragma: no cover
                time.sleep(10)

            # build a list of the fully qualified numbers we have
            contacts = ["+%s" % u.path for u in urn_batch]
            payload = {"blocking": "wait", "contacts": contacts}

            # go fetch our contacts
            headers = {"Authorization": "Bearer %s" % channel.config[Channel.CONFIG_AUTH_TOKEN]}
            resp = requests.post(
                channel.config[Channel.CONFIG_BASE_URL] + "/v1/contacts", json=payload, headers=headers
            )

            # if we had an error, break out
            if resp.status_code != 200:
                # interpolate the channel id - passing it as a second argument to Exception leaves the
                # message as an unformatted (template, id) tuple
                raise Exception("Received error refreshing contacts for %d" % channel.id)

            refreshed += len(urn_batch)

        print("refreshed %d whatsapp urns for channel %d" % (refreshed, channel_id))
def populate_recipients_for_broadcast(Broadcast, MsgManager, broadcast_id):
    """
    Rebuilds the recipients of one broadcast from the URNs of its messages.

    Existing recipients are cleared first, then one through-row is created per
    distinct contact_urn_id found on the broadcast's messages, in batches of 1,000.

    Returns the length of the message URN id list (not de-duplicated).
    """
    msg_urn_ids = MsgManager.filter(broadcast=broadcast_id).values_list("contact_urn_id", flat=True)

    # wipe whatever is there now, everything gets recreated below
    through_model = Broadcast.recipients.through
    broadcast = Broadcast.objects.get(id=broadcast_id)
    broadcast.recipients.clear()

    for id_batch in chunk_list(set(msg_urn_ids), 1000):
        rows = []
        for urn_id in id_batch:
            rows.append(through_model(contacturn_id=urn_id, broadcast_id=broadcast_id))
        through_model.objects.bulk_create(rows)

    return len(msg_urn_ids)
def populate_recipients_for_broadcast(Broadcast, MsgManager, broadcast_id):
    """
    Rebuilds the recipients of one broadcast from the contacts of its messages.

    Existing recipients are cleared first, then one through-row is created per
    distinct non-null contact_id found on the broadcast's messages, in batches
    of 1,000.

    Returns the number of distinct contacts.
    """
    raw_ids = MsgManager.filter(broadcast=broadcast_id).values_list('contact_id', flat=True)

    # de-duplicate and drop messages that have no contact
    unique_ids = set()
    for contact_id in raw_ids:
        if contact_id is not None:
            unique_ids.add(contact_id)

    # wipe whatever is there now, everything gets recreated below
    through_model = Broadcast.recipients.through
    broadcast = Broadcast.objects.get(id=broadcast_id)
    broadcast.recipients.clear()

    for id_batch in chunk_list(unique_ids, 1000):
        rows = []
        for contact_id in id_batch:
            rows.append(through_model(contact_id=contact_id, broadcast_id=broadcast_id))
        through_model.objects.bulk_create(rows)

    return len(unique_ids)
def check_campaigns_task():
    """
    Queues handler tasks for campaign event fires that are due but not yet fired.

    Only fires whose event's flow has flow_server_enabled=False are considered.
    Fires are grouped per event and pushed in batches of at most 500 ids; ids
    already recorded as queued are left out.
    """
    from temba.flows.models import Flow

    due_fires = (
        EventFire.objects.filter(fired=None, scheduled__lte=timezone.now(), event__flow__flow_server_enabled=False)
        .select_related("event")
        .values("id", "event_id", "event__flow_id")
    )

    # bucket the fire ids per event, remembering which flow each event uses
    flow_id_by_event = {}
    fires_by_event = defaultdict(list)
    for row in due_fires:
        event_id = row["event_id"]
        flow_id_by_event[event_id] = row["event__flow_id"]
        fires_by_event[event_id].append(row["id"])

    # load every flow referenced by those events
    flows_by_id = {}
    for flow in Flow.objects.filter(id__in=flow_id_by_event.values()):
        flows_by_id[flow.id] = flow

    queue_record = QueueRecord("queued_event_fires")

    for event_id, fire_ids in fires_by_event.items():
        flow = flows_by_id[flow_id_by_event[event_id]]

        # split into sub-batches so no single task is too big
        for id_batch in chunk_list(fire_ids, 500):
            # skip fires queued by an earlier run of this task that haven't been marked as fired yet
            to_queue = queue_record.filter_unqueued(id_batch)
            if not to_queue:
                continue

            try:
                push_task(flow.org_id, Queue.HANDLER, HANDLE_EVENT_TASK, {"type": FIRE_EVENT, "fires": to_queue})
                queue_record.set_queued(to_queue)
            except Exception:  # pragma: no cover
                fire_ids_str = ",".join(str(f) for f in to_queue)
                logger.error("Error queuing campaign event fires: %s" % fire_ids_str, exc_info=True)
def migrate_from_calls(apps, schema_editor):
    """
    Copies every Call row into a new ChannelEvent row, 1,000 calls at a time.

    Each event gets the first 'tel' URN of the call's contact, or None when the
    contact has none. Progress is printed after every batch and a summary at the
    end if anything was created.
    """
    Call = apps.get_model('msgs', 'Call')
    ChannelEvent = apps.get_model('channels', 'ChannelEvent')
    ContactURN = apps.get_model('contacts', 'ContactURN')

    call_ids = list(Call.objects.values_list('pk', flat=True))

    num_created = 0
    num_without_urn = 0

    # only the tel URNs of each contact are prefetched
    tel_urns = Prefetch('contact__urns', ContactURN.objects.filter(scheme='tel'))

    for id_batch in chunk_list(call_ids, 1000):
        calls = list(Call.objects.filter(pk__in=id_batch).prefetch_related(tel_urns))
        events = []

        for call in calls:
            urns = list(call.contact.urns.all())

            if urns:
                urn = urns[0]
            else:
                urn = None

            if not urn:
                num_without_urn += 1

            event = ChannelEvent(
                event_type=call.call_type,
                time=call.time,
                duration=call.duration,
                created_on=call.created_on,
                is_active=call.is_active,
                channel_id=call.channel_id,
                contact_id=call.contact_id,
                contact_urn=urn,
                org_id=call.org_id,
            )
            events.append(event)

        ChannelEvent.objects.bulk_create(events)
        num_created += len(events)

        print(" > Migrated %d of %d calls" % (num_created, len(call_ids)))

    if num_created:
        print("Migrated %d calls to channel events (couldn't find URN for %d)" % (num_created, num_without_urn))
def exit_active_flowruns(Contact, log=False):
    """
    Interrupts surplus active flow runs for contacts that have more than one.

    For each such contact the most recently modified active run is kept, along
    with every run on its parent chain; all other active runs are exited with
    FlowRun.EXIT_TYPE_INTERRUPTED in batches of 1,000.

    Args:
        Contact: the Contact model class to query.
        log: when true, progress is printed.
    """
    from temba.flows.models import FlowRun

    exit_runs = []

    # find all contacts that have more than one active run
    active_contact_ids = (
        Contact.objects.filter(runs__is_active=True)
        .order_by('id')
        .annotate(run_count=Count('id'))
        .filter(run_count__gt=1)
        .values_list('id', flat=True)
    )

    if log:
        print("%d contacts to evaluate runs for" % len(active_contact_ids))

    for idx, contact_id in enumerate(active_contact_ids):
        active_runs = FlowRun.objects.filter(contact_id=contact_id, is_active=True).order_by('-modified_on')

        # more than one? we may need to expire some
        if len(active_runs) > 1:
            last = active_runs[0]

            # collect the parent chain of the run we keep; those runs must stay active too
            ancestor_ids = set()
            ancestor = last.parent
            while ancestor:
                ancestor_ids.add(ancestor.id)
                ancestor = ancestor.parent

            # ancestors are filtered out of this contact's own list; removing them from
            # the global list would raise ValueError since they were never added to it
            exit_runs += [r.id for r in active_runs[1:] if r.id not in ancestor_ids]

        if (idx % 100) == 0:
            if log:
                print(" - %d / %d contacts evaluated. %d runs to exit" % (idx, len(active_contact_ids), len(exit_runs)))

    # ok, now exit those runs
    exited = 0
    for batch in chunk_list(exit_runs, 1000):
        runs = FlowRun.objects.filter(id__in=batch)
        FlowRun.bulk_exit(runs, FlowRun.EXIT_TYPE_INTERRUPTED, timezone.now())

        exited += len(batch)
        if log:
            print(" * %d / %d runs exited." % (exited, len(exit_runs)))
def trim_event_fires_task():
    """
    Deletes up to EVENT_FIRES_TO_TRIM event fires.

    Unfired fires of inactive events are taken first; any remaining quota is
    filled with fires that fired more than settings.EVENT_FIRE_TRIM_DAYS days
    ago, oldest first. Deletion happens in batches of 100 ids.
    """
    start = timezone.now()
    boundary = timezone.now() - timedelta(days=settings.EVENT_FIRE_TRIM_DAYS)

    # unfired fires whose event has been deactivated go first
    orphaned = EventFire.objects.filter(fired=None, event__is_active=False).values_list("id", flat=True)
    trim_ids = list(orphaned[:EVENT_FIRES_TO_TRIM])

    # any room left in the quota is used for old fires that already fired
    remaining = EVENT_FIRES_TO_TRIM - len(trim_ids)
    if remaining > 0:
        stale = EventFire.objects.filter(fired__lt=boundary).values_list("id", flat=True).order_by("fired")
        trim_ids.extend(stale[:remaining])

    for id_batch in chunk_list(trim_ids, 100):
        # bulk delete for performance reasons, nothing references EventFire
        EventFire.objects.filter(id__in=id_batch).delete()

    print(f"Deleted {len(trim_ids)} event fires in {timezone.now()-start}")
def resolve_twitter_ids():
    """
    Resolves screen-name based twitter URNs to twitterid URNs, 100 at a time.

    For every screen name the Twitter lookup returns, a twitterid URN is created
    (or reused) for the same contact and the old URN is detached from its
    contact. URNs that could not be resolved are collected, and their contacts
    are stopped when that URN is the contact's only one.

    Does nothing if the "resolve_twitter_ids_task" redis key is already set. A
    lookup exception whose text does not contain "No user matches" ends the
    batch loop early.
    """
    r = get_redis_connection()
    # TODO: we can't use our non-overlapping task decorator as it creates a loop in the celery resolver when registering
    if r.get("resolve_twitter_ids_task"):  # pragma: no cover
        return

    with r.lock("resolve_twitter_ids_task", 1800):
        # look up all 'twitter' URNs, limiting to 30k since that's the most our API would allow anyways
        twitter_urns = ContactURN.objects.filter(
            scheme=TWITTER_SCHEME, contact__is_stopped=False, contact__is_blocked=False
        ).exclude(contact=None)
        twitter_urns = twitter_urns[:30000].only("id", "org", "contact", "path")
        api_key = settings.TWITTER_API_KEY
        api_secret = settings.TWITTER_API_SECRET
        client = Twython(api_key, api_secret)

        updated = 0
        print("found %d twitter urns to resolve" % len(twitter_urns))

        # URNs we could not resolve; their contacts are candidates for stopping
        stop_contacts = []

        # we try to look these up 100 at a time
        for urn_batch in chunk_list(twitter_urns, 100):
            screen_names = [u.path for u in urn_batch]
            screen_map = {u.path: u for u in urn_batch}

            # try to fetch our users by screen name
            try:
                resp = client.lookup_user(screen_name=",".join(screen_names))

                for twitter_user in resp:
                    screen_name = twitter_user["screen_name"].lower()
                    twitter_id = twitter_user["id"]

                    if screen_name in screen_map and twitter_user["id"]:
                        twitterid_urn = URN.normalize(URN.from_twitterid(twitter_id, screen_name))
                        old_urn = screen_map[screen_name]

                        # create our new contact URN
                        new_urn = ContactURN.get_or_create(old_urn.org, old_urn.contact, twitterid_urn)

                        # if our new URN already existed for another contact and it is newer
                        # than our old contact, reassign it to the old contact
                        if (
                            new_urn.contact != old_urn.contact
                            and new_urn.contact.created_on > old_urn.contact.created_on
                        ):
                            new_urn.contact = old_urn.contact
                            new_urn.save(update_fields=["contact"])

                        # get rid of our old URN
                        ContactURN.objects.filter(id=old_urn.id).update(contact=None)

                        # resolved, so it must not end up in the stop list below
                        del screen_map[screen_name]
                        updated += 1

            except Exception as e:
                # if this wasn't an exception caused by not finding any of the users, then break
                if "No user matches" not in str(e):
                    # exit, we'll try again later
                    print("exiting resolve_twitter_ids due to exception: %s" % e)
                    break

            # everything still in the map was not resolved in this batch
            stop_contacts.extend(screen_map.values())

        # stop all the contacts we couldn't resolve that have only a twitter URN
        stopped = 0
        for contact_urn in stop_contacts:
            contact = contact_urn.contact
            if len(contact.urns.all()) == 1:
                contact.stop(contact.created_by)
                stopped += 1

        if len(twitter_urns) > 0:
            # report the contacts actually stopped, not the number of unresolved URNs
            print("updated %d twitter urns, %d stopped" % (updated, stopped))
def create_contacts(self, orgs, locations, num_contacts):
    """
    Creates test and regular contacts for this database.

    Test contacts are fetched per org user and stored in org.cache["test_contacts"].
    Regular contacts are built as plain dicts, one batch at a time, and each batch is
    handed to self._create_contact_batch. Group membership counts are accumulated
    while generating and written as ContactGroupCount rows at the end.

    Args:
        orgs: orgs to spread the contacts over; each is expected to carry a `cache`
            dict with "users", "fields", "system_groups" and "groups" entries.
        locations: candidates for a contact's location; each chosen item is indexed
            [0], [1], [2] as ward, district and state.
        num_contacts: number of regular contacts to generate.

    NOTE(review): an earlier version of this docstring said tuples of org, contact id
    and preferred urn id are returned, but this body has no return statement.
    """
    group_counts = defaultdict(int)

    self._log("Creating %d test contacts..." % (len(orgs) * len(USERS)))

    for org in orgs:
        test_contacts = []
        for user in org.cache["users"]:
            test_contacts.append(Contact.get_test_contact(user))
        org.cache["test_contacts"] = test_contacts

    self._log(self.style.SUCCESS("OK") + "\n")
    self._log("Creating %d regular contacts...\n" % num_contacts)

    # disable table triggers to speed up insertion and in the case of contact group m2m, avoid having an unsquashed
    # count row for every contact
    with DisableTriggersOn(Contact, ContactURN, ContactGroup.contacts.through):
        # every combination of the two CONTACT_NAMES lists; a combination that strips to "" becomes None
        names = [("%s %s" % (c1, c2)).strip() for c2 in CONTACT_NAMES[1] for c1 in CONTACT_NAMES[0]]
        names = [n if n else None for n in names]

        batch_num = 1
        for index_batch in chunk_list(range(num_contacts), self.batch_size):
            batch = []

            # generate flat representations and contact objects for this batch
            # NOTE(review): presumably the self.random_* / self.probability helpers draw from one shared
            # generator, in which case the order of the calls below affects the generated data - confirm
            # before reordering anything
            for c_index in index_batch:  # pragma: no cover
                org = self.random_org(orgs)
                name = self.random_choice(names)
                location = self.random_choice(locations) if self.probability(CONTACT_HAS_FIELD_PROB) else None
                created_on = self.timeline_date(c_index / num_contacts)

                c = {
                    "org": org,
                    "user": org.cache["users"][0],
                    "name": name,
                    "groups": [],
                    "tel": "+2507%08d" % c_index if self.probability(CONTACT_HAS_TEL_PROB) else None,
                    # handle is the lowercased name with underscores, or "tweep" for nameless contacts,
                    # followed by the contact index
                    "twitter": "%s%d" % (name.replace(" ", "_").lower() if name else "tweep", c_index)
                    if self.probability(CONTACT_HAS_TWITTER_PROB)
                    else None,
                    "gender": self.random_choice(("M", "F")) if self.probability(CONTACT_HAS_FIELD_PROB) else None,
                    "age": self.random.randint(16, 80) if self.probability(CONTACT_HAS_FIELD_PROB) else None,
                    "joined": self.random_date() if self.probability(CONTACT_HAS_FIELD_PROB) else None,
                    "ward": location[0] if location else None,
                    "district": location[1] if location else None,
                    "state": location[2] if location else None,
                    "language": self.random_choice(CONTACT_LANGS),
                    "is_stopped": self.probability(CONTACT_IS_STOPPED_PROB),
                    "is_blocked": self.probability(CONTACT_IS_BLOCKED_PROB),
                    "is_active": self.probability(1 - CONTACT_IS_DELETED_PROB),
                    "created_on": created_on,
                    "modified_on": self.random_date(created_on, self.db_ends_on),
                }

                # field values keyed by the uuid of the org's field; only fields that got a value are included
                c["fields_as_json"] = {}

                if c["gender"] is not None:
                    c["fields_as_json"][str(org.cache["fields"]["gender"].uuid)] = {"text": str(c["gender"])}
                if c["age"] is not None:
                    c["fields_as_json"][str(org.cache["fields"]["age"].uuid)] = {
                        "text": str(c["age"]),
                        "number": str(c["age"]),
                    }
                if c["joined"] is not None:
                    c["fields_as_json"][str(org.cache["fields"]["joined"].uuid)] = {
                        "text": org.format_datetime(c["joined"], show_time=False),
                        "datetime": timezone.localtime(c["joined"], org.timezone).isoformat(),
                    }

                if location:
                    # each location level stores its own path plus the paths of the levels above it;
                    # the text value is the last segment of the " > " separated path
                    c["fields_as_json"].update(
                        {
                            str(org.cache["fields"]["ward"].uuid): {
                                "text": str(c["ward"].path.split(" > ")[-1]),
                                "ward": c["ward"].path,
                                "district": c["district"].path,
                                "state": c["state"].path,
                            },
                            str(org.cache["fields"]["district"].uuid): {
                                "text": str(c["district"].path.split(" > ")[-1]),
                                "district": c["district"].path,
                                "state": c["state"].path,
                            },
                            str(org.cache["fields"]["state"].uuid): {
                                "text": str(c["state"].path.split(" > ")[-1]),
                                "state": c["state"].path,
                            },
                        }
                    )

                # work out which system groups this contact belongs to
                if c["is_active"]:
                    if not c["is_blocked"] and not c["is_stopped"]:
                        c["groups"].append(org.cache["system_groups"][ContactGroup.TYPE_ALL])
                    if c["is_blocked"]:
                        c["groups"].append(org.cache["system_groups"][ContactGroup.TYPE_BLOCKED])
                    if c["is_stopped"]:
                        c["groups"].append(org.cache["system_groups"][ContactGroup.TYPE_STOPPED])

                # let each user group decide if it is taking this contact
                # (g.member is either a callable taking the contact dict or a probability)
                for g in org.cache["groups"]:
                    if g.member(c) if callable(g.member) else self.probability(g.member):
                        c["groups"].append(g)

                # track changes to group counts
                for g in c["groups"]:
                    group_counts[g] += 1

                batch.append(c)

            self._create_contact_batch(batch)
            # the total shown is a floor division, so a trailing partial batch is not included in it
            self._log(" > Created batch %d of %d\n" % (batch_num, max(num_contacts // self.batch_size, 1)))
            batch_num += 1

    # create group count records manually
    counts = []
    for group, count in group_counts.items():
        counts.append(ContactGroupCount(group=group, count=count, is_squashed=True))
        # also set the count on the in-memory group object
        group.count = count
    ContactGroupCount.objects.bulk_create(counts)
def add_deps(Flow, ActionSet, RuleSet):
    """
    Adds flow-to-flow dependency rows that are missing for active flows.

    Expected dependencies are collected from "flow" and "trigger-flow" actions
    and from "subflow" rulesets, keeping only references to active flows. They
    are then compared with the rows in the flow_dependencies through table and
    any missing rows are bulk-created. Existing rows are never removed.
    """
    through_model = Flow.flow_dependencies.through
    flow_action_types = ("flow", "trigger-flow")
    subflow_ruleset_type = "subflow"

    start_time = time.monotonic()
    print("Collecting flows and dependencies...")

    # inactive flows have their deps cleared out, so only active flows are looked at
    active_id_by_uuid = {}
    flow_ids = []
    for flow_id, flow_uuid in Flow.objects.filter(is_active=True).values_list("id", "uuid"):
        active_id_by_uuid[flow_uuid] = flow_id
        flow_ids.append(flow_id)

    total_flows = len(flow_ids)
    processed_flows = 0
    expected_flow_deps = defaultdict(set)

    print("Processing flow dependencies...")

    for id_chunk in chunk_list(flow_ids, 1000):
        chunk_start_time = time.monotonic()

        # all action lists of each flow in the chunk, aggregated into one row per flow
        actionset_rows = (
            ActionSet.objects.filter(flow_id__in=id_chunk).values("flow_id").annotate(actions=ArrayAgg("actions"))
        )
        for row in actionset_rows:
            owner_id = row["flow_id"]
            for action_list in row["actions"]:
                for action in action_list:
                    if action["type"] in flow_action_types:
                        target_uuid = action["flow"]["uuid"]
                        # references to inactive flows are ignored
                        if target_uuid in active_id_by_uuid:
                            expected_flow_deps[owner_id].add(active_id_by_uuid[target_uuid])

        # all subflow ruleset configs of each flow in the chunk, aggregated the same way
        ruleset_rows = (
            RuleSet.objects.filter(flow_id__in=id_chunk, ruleset_type=subflow_ruleset_type)
            .values("flow_id")
            .annotate(configs=ArrayAgg("config"))
        )
        for row in ruleset_rows:
            owner_id = row["flow_id"]
            for config in row["configs"]:
                target_uuid = config["flow"]["uuid"]
                # references to inactive flows are ignored
                if target_uuid in active_id_by_uuid:
                    expected_flow_deps[owner_id].add(active_id_by_uuid[target_uuid])

        processed_flows += len(id_chunk)
        print(f"Processed {processed_flows}/{total_flows} in {time.monotonic() - chunk_start_time}")

    print(f"Collected flows and dependencies in {time.monotonic() - start_time}")
    print("Comparing actual to expected flow dependencies...")

    rows_to_create = []
    for from_flow_id, wanted in list(expected_flow_deps.items()):
        existing_rows = (
            through_model.objects.filter(from_flow_id=from_flow_id)
            .values("from_flow_id")
            .annotate(deps=ArrayAgg("to_flow_id"))
        )
        first_row = next(existing_rows.iterator(), None)
        existing = set(first_row["deps"]) if first_row else set()

        for dep in wanted.difference(existing):
            rows_to_create.append(through_model(from_flow_id=from_flow_id, to_flow_id=dep))

    total_added_deps = len(rows_to_create)
    through_model.objects.bulk_create(rows_to_create)

    print(f"Total added missing deps: {total_added_deps}")