def choose_date_sent(num_messages: int, tot_messages: int, threads: int) -> datetime:
    """Pick a spoofed send time for message number ``num_messages``.

    The first 80% of messages (``int(tot_messages * 0.8)`` of them) are
    distributed starting from 5 days ago over a period of 3 days; the
    remaining messages are distributed over the past 24 hours.  Each message
    owns one equal-width time slot inside its chunk and is placed uniformly
    at random within that slot.

    When ``threads != 1`` the current time is returned instead, because
    spoofing time is not supported with threading.
    """
    if threads != 1:
        return timezone_now()

    first_chunk_size = int(tot_messages * 0.8)
    second_chunk_size = tot_messages - first_chunk_size

    if num_messages < first_chunk_size:
        # Older chunk: starts 5 days ago and spans 3 days.
        chunk_start = timezone_now() - timezone_timedelta(days=5)
        slot_seconds = 3 * 24 * 60 * 60 / first_chunk_size
        slot_index = num_messages
    else:
        # We're in the last 20% of messages: spread them over the last 24 hours.
        chunk_start = timezone_now() - timezone_timedelta(days=1)
        slot_seconds = 24 * 60 * 60 / second_chunk_size
        slot_index = num_messages - first_chunk_size

    offset_seconds = random.uniform(slot_seconds * slot_index, slot_seconds * (slot_index + 1))
    return chunk_start + timezone_timedelta(seconds=offset_seconds)
def choose_date_sent(
    num_messages: int, tot_messages: int, oldest_message_days: int, threads: int
) -> datetime:
    """Pick a spoofed send time for message number ``num_messages``.

    Args:
        num_messages: Zero-based index of the message being generated.
        tot_messages: Total number of messages that will be generated.
        oldest_message_days: How many days ago the oldest message is placed.
        threads: Number of worker threads; spoofing is only done when it is 1.

    Returns:
        The current time when ``threads != 1``; otherwise a timestamp inside
        the time slot owned by this message (see below).
    """
    # Spoofing time not supported with threading
    if threads != 1:
        return timezone_now()

    # We want to ensure that:
    # (1) some messages are sent in the last 4 hours,
    # (2) there are some >24hr gaps between adjacent messages, and
    # (3) a decent bulk of messages in the last day so you see adjacent messages with the same date.
    # So we distribute 80% of messages starting from oldest_message_days days ago, over a period
    # of at most one day (see the clamp below). Then we distribute the remaining messages
    # over the past 24 hours.
    amount_in_first_chunk = int(tot_messages * 0.8)
    amount_in_second_chunk = tot_messages - amount_in_first_chunk

    if num_messages < amount_in_first_chunk:
        spoofed_date = timezone_now() - timezone_timedelta(days=oldest_message_days)
        # min(oldest_message_days - 2, 1) is negative for oldest_message_days < 2,
        # which used to yield a negative slot width and therefore timestamps
        # *older* than oldest_message_days, in reverse order.  Clamp at zero so
        # the first chunk never reaches back past its own start.
        # NOTE(review): max(oldest_message_days - 2, 1) may have been the
        # original intent; results for oldest_message_days >= 2 are kept as-is here.
        num_days_for_first_chunk = max(min(oldest_message_days - 2, 1), 0)
        interval_size = num_days_for_first_chunk * 24 * 60 * 60 / amount_in_first_chunk
        lower_bound = interval_size * num_messages
        upper_bound = interval_size * (num_messages + 1)
    else:
        # We're in the last 20% of messages, so distribute them over the last 24 hours:
        spoofed_date = timezone_now() - timezone_timedelta(days=1)
        interval_size = 24 * 60 * 60 / amount_in_second_chunk
        lower_bound = interval_size * (num_messages - amount_in_first_chunk)
        upper_bound = interval_size * (num_messages - amount_in_first_chunk + 1)

    # Each message gets a uniformly random position inside its own slot.
    offset_seconds = random.uniform(lower_bound, upper_bound)
    spoofed_date += timezone_timedelta(seconds=offset_seconds)
    return spoofed_date
def test_choose_pub_date_large_tot_messages(self) -> None:
    """
    Test for a bug that was present, where specifying a large
    amount of messages to generate would cause each message to
    have pub_date set to timezone_now(), instead of the pub_dates
    being distributed across the span of several days.
    """
    tot_messages = 1000000
    # Sample 100 evenly spaced message indices instead of all one million.
    datetimes_list = [
        choose_pub_date(i, tot_messages, 1)
        for i in range(1, tot_messages, tot_messages // 100)
    ]

    # Verify there is a meaningful difference between elements.
    # assertGreater reports both operands on failure, unlike assertTrue(a > b).
    min_gap = timezone_timedelta(minutes=5)
    for earlier, later in zip(datetimes_list, datetimes_list[1:]):
        self.assertGreater(later - earlier, min_gap)
def send_messages(
    data: Tuple[int, Sequence[Sequence[int]], Mapping[str, Any], Callable[[str], Any], int]
) -> int:
    """Generate and send ``tot_messages`` test messages.

    ``data`` is unpacked as ``(tot_messages, personals_pairs, options, output,
    random_seed)``.  The ``options`` keys read here are ``"stickyness"``,
    ``"percent_huddles"``, ``"percent_personals"`` and ``"threads"``;
    ``output`` is not used by this function.  Message bodies are cycled from
    ``var/test_messages.json`` after shuffling.

    Returns ``tot_messages``.
    """
    (tot_messages, personals_pairs, options, output, random_seed) = data
    random.seed(random_seed)

    with open("var/test_messages.json", "r") as infile:
        dialog = ujson.load(infile)
    random.shuffle(dialog)
    texts = itertools.cycle(dialog)

    recipient_streams = [klass.id for klass in Recipient.objects.filter(type=Recipient.STREAM)]  # type: List[int]
    recipient_huddles = [h.id for h in Recipient.objects.filter(type=Recipient.HUDDLE)]  # type: List[int]

    huddle_members = {}  # type: Dict[int, List[int]]
    for h in recipient_huddles:
        huddle_members[h] = [
            s.user_profile.id for s in Subscription.objects.filter(recipient_id=h)
        ]

    # Sizes of the two timestamp chunks.  The first 80% of messages are spread
    # over 3 days starting 5 days ago; the rest over the past 24 hours.  The
    # second chunk is never empty while the loop below runs (tot_messages >= 1).
    amount_in_first_chunk = int(tot_messages * 0.8)
    amount_in_second_chunk = tot_messages - amount_in_first_chunk

    num_messages = 0
    random_max = 1000000
    recipients = {}  # type: Dict[int, Tuple[int, int, Dict[str, Any]]]
    while num_messages < tot_messages:
        saved_data = {}  # type: Dict[str, Any]
        message = Message()
        message.sending_client = get_client('populate_db')
        message.content = next(texts)

        randkey = random.randint(1, random_max)
        if (num_messages > 0 and
                random.randint(1, random_max) * 100. / random_max < options["stickyness"]):
            # Use an old recipient
            message_type, recipient_id, saved_data = recipients[num_messages - 1]
            if message_type == Recipient.PERSONAL:
                personals_pair = saved_data['personals_pair']
                random.shuffle(personals_pair)
            elif message_type == Recipient.STREAM:
                # Note: this subject is overwritten by the STREAM branch further down.
                message.subject = saved_data['subject']
                message.recipient = get_recipient_by_id(recipient_id)
            elif message_type == Recipient.HUDDLE:
                message.recipient = get_recipient_by_id(recipient_id)
        elif (randkey <= random_max * options["percent_huddles"] / 100.):
            message_type = Recipient.HUDDLE
            message.recipient = get_recipient_by_id(random.choice(recipient_huddles))
        elif (randkey <= random_max * (options["percent_huddles"] + options["percent_personals"]) / 100.):
            message_type = Recipient.PERSONAL
            personals_pair = random.choice(personals_pairs)
            random.shuffle(personals_pair)
        else:
            # randkey never exceeds random_max, so everything else is a stream message.
            message_type = Recipient.STREAM
            message.recipient = get_recipient_by_id(random.choice(recipient_streams))

        if message_type == Recipient.HUDDLE:
            sender_id = random.choice(huddle_members[message.recipient.id])
            message.sender = get_user_profile_by_id(sender_id)
        elif message_type == Recipient.PERSONAL:
            message.recipient = Recipient.objects.get(
                type=Recipient.PERSONAL, type_id=personals_pair[0])
            message.sender = get_user_profile_by_id(personals_pair[1])
            saved_data['personals_pair'] = personals_pair
        elif message_type == Recipient.STREAM:
            stream = Stream.objects.get(id=message.recipient.type_id)
            # Pick a random subscriber to the stream
            message.sender = random.choice(
                Subscription.objects.filter(recipient=message.recipient)).user_profile
            message.subject = stream.name + str(random.randint(1, 3))
            saved_data['subject'] = message.subject

        # Spoofing time not supported with threading
        if options['threads'] != 1:
            message.pub_date = timezone_now()
        else:
            # Distributes 80% of messages starting from 5 days ago, over a period
            # of 3 days. Then, distributes remaining messages over past 24 hours.
            #
            # Slot widths are float seconds.  The previous whole-minute floor
            # division collapsed to 0 for large tot_messages and divided by
            # zero for very small tot_messages.
            if num_messages < amount_in_first_chunk:
                spoofed_date = timezone_now() - timezone_timedelta(days=5)
                interval_size = 3 * 24 * 60 * 60 / amount_in_first_chunk
                slot = num_messages
            else:
                spoofed_date = timezone_now() - timezone_timedelta(days=1)
                interval_size = 24 * 60 * 60 / amount_in_second_chunk
                slot = num_messages - amount_in_first_chunk
            offset_seconds = random.uniform(interval_size * slot, interval_size * (slot + 1))
            message.pub_date = spoofed_date + timezone_timedelta(seconds=offset_seconds)

        # We disable USING_RABBITMQ here, so that deferred work is
        # executed in do_send_message_messages, rather than being
        # queued.  This is important, because otherwise, if run-dev.py
        # wasn't running when populate_db was run, a developer can end
        # up with queued events that reference objects from a previous
        # life of the database, which naturally throws exceptions.
        settings.USING_RABBITMQ = False
        try:
            do_send_messages([{'message': message}])
        finally:
            # Re-enable even when sending raises, so the flag is never left off.
            settings.USING_RABBITMQ = True

        recipients[num_messages] = (message_type, message.recipient.id, saved_data)
        num_messages += 1
    return tot_messages
def send_messages(data: Tuple[int, Sequence[Sequence[int]], Mapping[str, Any], Callable[[str], Any], int]) -> int:
    """Generate and send ``tot_messages`` test messages.

    ``data`` is unpacked as ``(tot_messages, personals_pairs, options, output,
    random_seed)``.  The ``options`` keys read here are ``"stickyness"``,
    ``"percent_huddles"``, ``"percent_personals"`` and ``"threads"``;
    ``output`` is not used by this function.  Message bodies are cycled from
    ``var/test_messages.json`` after shuffling.

    Returns ``tot_messages``.
    """
    (tot_messages, personals_pairs, options, output, random_seed) = data
    random.seed(random_seed)

    with open("var/test_messages.json", "r") as infile:
        dialog = ujson.load(infile)
    random.shuffle(dialog)
    texts = itertools.cycle(dialog)

    recipient_streams = [klass.id for klass in Recipient.objects.filter(type=Recipient.STREAM)]  # type: List[int]
    recipient_huddles = [h.id for h in Recipient.objects.filter(type=Recipient.HUDDLE)]  # type: List[int]

    huddle_members = {}  # type: Dict[int, List[int]]
    for h in recipient_huddles:
        huddle_members[h] = [s.user_profile.id for s in
                             Subscription.objects.filter(recipient_id=h)]

    # Sizes of the two timestamp chunks.  The first 80% of messages are spread
    # over 3 days starting 5 days ago; the rest over the past 24 hours.  The
    # second chunk is never empty while the loop below runs (tot_messages >= 1).
    amount_in_first_chunk = int(tot_messages * 0.8)
    amount_in_second_chunk = tot_messages - amount_in_first_chunk

    num_messages = 0
    random_max = 1000000
    recipients = {}  # type: Dict[int, Tuple[int, int, Dict[str, Any]]]
    while num_messages < tot_messages:
        saved_data = {}  # type: Dict[str, Any]
        message = Message()
        message.sending_client = get_client('populate_db')
        message.content = next(texts)

        randkey = random.randint(1, random_max)
        if (num_messages > 0 and
                random.randint(1, random_max) * 100. / random_max < options["stickyness"]):
            # Use an old recipient
            message_type, recipient_id, saved_data = recipients[num_messages - 1]
            if message_type == Recipient.PERSONAL:
                personals_pair = saved_data['personals_pair']
                random.shuffle(personals_pair)
            elif message_type == Recipient.STREAM:
                # Note: this subject is overwritten by the STREAM branch further down.
                message.subject = saved_data['subject']
                message.recipient = get_recipient_by_id(recipient_id)
            elif message_type == Recipient.HUDDLE:
                message.recipient = get_recipient_by_id(recipient_id)
        elif (randkey <= random_max * options["percent_huddles"] / 100.):
            message_type = Recipient.HUDDLE
            message.recipient = get_recipient_by_id(random.choice(recipient_huddles))
        elif (randkey <= random_max * (options["percent_huddles"] + options["percent_personals"]) / 100.):
            message_type = Recipient.PERSONAL
            personals_pair = random.choice(personals_pairs)
            random.shuffle(personals_pair)
        else:
            # randkey never exceeds random_max, so everything else is a stream message.
            message_type = Recipient.STREAM
            message.recipient = get_recipient_by_id(random.choice(recipient_streams))

        if message_type == Recipient.HUDDLE:
            sender_id = random.choice(huddle_members[message.recipient.id])
            message.sender = get_user_profile_by_id(sender_id)
        elif message_type == Recipient.PERSONAL:
            message.recipient = Recipient.objects.get(type=Recipient.PERSONAL,
                                                      type_id=personals_pair[0])
            message.sender = get_user_profile_by_id(personals_pair[1])
            saved_data['personals_pair'] = personals_pair
        elif message_type == Recipient.STREAM:
            stream = Stream.objects.get(id=message.recipient.type_id)
            # Pick a random subscriber to the stream
            message.sender = random.choice(Subscription.objects.filter(
                recipient=message.recipient)).user_profile
            message.subject = stream.name + str(random.randint(1, 3))
            saved_data['subject'] = message.subject

        # Spoofing time not supported with threading
        if options['threads'] != 1:
            message.pub_date = timezone_now()
        else:
            # Distributes 80% of messages starting from 5 days ago, over a period
            # of 3 days. Then, distributes remaining messages over past 24 hours.
            #
            # Slot widths are float seconds.  The previous whole-minute floor
            # division collapsed to 0 for large tot_messages and divided by
            # zero for very small tot_messages.
            if num_messages < amount_in_first_chunk:
                spoofed_date = timezone_now() - timezone_timedelta(days=5)
                interval_size = 3 * 24 * 60 * 60 / amount_in_first_chunk
                slot = num_messages
            else:
                spoofed_date = timezone_now() - timezone_timedelta(days=1)
                interval_size = 24 * 60 * 60 / amount_in_second_chunk
                slot = num_messages - amount_in_first_chunk
            offset_seconds = random.uniform(interval_size * slot, interval_size * (slot + 1))
            message.pub_date = spoofed_date + timezone_timedelta(seconds=offset_seconds)

        # We disable USING_RABBITMQ here, so that deferred work is
        # executed in do_send_message_messages, rather than being
        # queued.  This is important, because otherwise, if run-dev.py
        # wasn't running when populate_db was run, a developer can end
        # up with queued events that reference objects from a previous
        # life of the database, which naturally throws exceptions.
        settings.USING_RABBITMQ = False
        try:
            do_send_messages([{'message': message}])
        finally:
            # Re-enable even when sending raises, so the flag is never left off.
            settings.USING_RABBITMQ = True

        recipients[num_messages] = (message_type, message.recipient.id, saved_data)
        num_messages += 1
    return tot_messages