def process_raw_message_batch(realm_id: int,
                              raw_messages: List[Dict[str, Any]],
                              subscriber_map: Dict[int, Set[int]],
                              user_handler: UserHandler,
                              attachment_handler: AttachmentHandler,
                              get_recipient_id: Callable[[ZerverFieldsT], int],
                              is_pm_data: bool,
                              output_dir: str) -> None:
    """Convert one batch of raw HipChat messages and dump it as JSON.

    For every raw message a new message id is allocated, its mentions
    are rewritten to Zulip syntax, any attachment is handed to
    ``attachment_handler`` (whose returned text is appended to the
    message body), and a message row is built.  The resulting message
    and usermessage rows are written together to a numbered
    ``messages-NNNNNN.json`` file via ``create_converted_data_files``.
    """

    def fix_mentions(content: str,
                     mention_user_ids: List[int]) -> str:
        # Swap every "@short_name" for "@**Full Name**", then map the
        # "@here" wildcard onto Zulip's "@**all**".
        fixed = content
        for mentioned_id in mention_user_ids:
            profile = user_handler.get_user(user_id=mentioned_id)
            fixed = fixed.replace(
                '@{short_name}'.format(**profile),
                '@**{full_name}**'.format(**profile),
            )
        return fixed.replace('@here', '@**all**')

    # message id -> ids of the users mentioned in that message
    mention_map = {}  # type: Dict[int, Set[int]]

    zerver_message = []
    for raw_message in raw_messages:
        message_id = NEXT_ID('message')
        mention_map[message_id] = set(raw_message['mention_user_ids'])

        content = fix_mentions(
            content=raw_message['content'],
            mention_user_ids=raw_message['mention_user_ids'],
        )
        pub_date = raw_message['pub_date']
        recipient_id = get_recipient_id(raw_message)

        # Private messages carry an empty subject.
        subject = '' if is_pm_data else 'imported from hipchat'
        user_id = raw_message['sender_id']

        # The attachment handler records the upload as a side effect and
        # hands back text to append to the message (falsy if none).
        extra_content = attachment_handler.handle_message_data(
            realm_id=realm_id,
            message_id=message_id,
            sender_id=user_id,
            attachment=raw_message['attachment'],
            files_dir=raw_message['files_dir'],
        )
        has_attachment = bool(extra_content)
        if has_attachment:
            content += '\n' + extra_content

        zerver_message.append(build_message(
            content=content,
            message_id=message_id,
            pub_date=pub_date,
            recipient_id=recipient_id,
            rendered_content=None,
            subject=subject,
            user_id=user_id,
            has_attachment=has_attachment,
        ))

    zerver_usermessage = make_user_messages(
        zerver_message=zerver_message,
        subscriber_map=subscriber_map,
        is_pm_data=is_pm_data,
        mention_map=mention_map,
    )

    message_file = "/messages-%06d.json" % (NEXT_ID('dump_file_id'), )
    create_converted_data_files(
        dict(
            zerver_message=zerver_message,
            zerver_usermessage=zerver_usermessage,
        ),
        output_dir,
        message_file,
    )
def channel_message_to_zerver_message(
    realm_id: int,
    users: List[ZerverFieldsT],
    slack_user_id_to_zulip_user_id: SlackToZulipUserIDT,
    slack_recipient_name_to_zulip_recipient_id: SlackToZulipRecipientT,
    all_messages: List[ZerverFieldsT],
    zerver_realmemoji: List[ZerverFieldsT],
    subscriber_map: Dict[int, Set[int]],
    added_channels: AddedChannelsT,
    dm_members: DMMembersT,
    domain_name: str,
    long_term_idle: Set[int],
) -> Tuple[
    List[ZerverFieldsT],
    List[ZerverFieldsT],
    List[ZerverFieldsT],
    List[ZerverFieldsT],
    List[ZerverFieldsT],
]:
    """
    Convert Slack messages into Zulip message, usermessage, attachment,
    upload and reaction rows.

    Messages without a sending user, messages with a skipped subtype and
    messages whose text cannot be converted are dropped.

    Returns:
    1. zerver_message, which is a list of the messages
    2. zerver_usermessage, which is a list of the usermessages
    3. zerver_attachment, which is a list of the attachments
    4. uploads_list, which is a list of uploads to be mapped in uploads records.json
    5. reaction_list, which is a list of all user reactions

    Raises:
    AssertionError if a message that reaches recipient resolution has
    none of the "channel_name", "mpim_name" or "pm_name" keys.
    """
    zerver_message = []
    zerver_usermessage: List[ZerverFieldsT] = []
    uploads_list: List[ZerverFieldsT] = []
    zerver_attachment: List[ZerverFieldsT] = []
    reaction_list: List[ZerverFieldsT] = []

    total_user_messages = 0
    total_skipped_user_messages = 0
    for message in all_messages:
        slack_user_id = get_message_sending_user(message)
        if not slack_user_id:
            # Ignore messages without slack_user_id;
            # Slack sometimes produces these.
            continue

        subtype = message.get("subtype", False)
        if subtype in [
            # Zulip doesn't have a pinned_item concept
            "pinned_item",
            "unpinned_item",
            # Slack's channel join/leave notices are spammy
            "channel_join",
            "channel_leave",
            "channel_name",
        ]:
            continue

        try:
            content, mentioned_user_ids, has_link = convert_to_zulip_markdown(
                message["text"], users, added_channels, slack_user_id_to_zulip_user_id
            )
        except Exception:
            print("Slack message unexpectedly missing text representation:")
            print(orjson.dumps(message, option=orjson.OPT_INDENT_2).decode())
            continue
        rendered_content = None

        # Reset on every iteration so a value left over from an earlier
        # direct message can never leak into this one.
        sender_recipient_id = None
        if "channel_name" in message:
            is_private = False
            recipient_id = slack_recipient_name_to_zulip_recipient_id[message["channel_name"]]
        elif "mpim_name" in message:
            is_private = True
            recipient_id = slack_recipient_name_to_zulip_recipient_id[message["mpim_name"]]
        elif "pm_name" in message:
            is_private = True
            sender = get_message_sending_user(message)
            members = dm_members[message["pm_name"]]
            # The recipient is whichever of the two members is not the sender.
            if sender == members[0]:
                recipient_id = slack_recipient_name_to_zulip_recipient_id[members[1]]
                sender_recipient_id = slack_recipient_name_to_zulip_recipient_id[members[0]]
            else:
                recipient_id = slack_recipient_name_to_zulip_recipient_id[members[0]]
                sender_recipient_id = slack_recipient_name_to_zulip_recipient_id[members[1]]
        else:
            # Without this branch recipient_id/is_private would be unbound
            # (or, worse, silently reused from the previous message).
            raise AssertionError("message without channel_name, mpim_name or pm_name key")

        message_id = NEXT_ID("message")

        if "reactions" in message:
            build_reactions(
                reaction_list,
                message["reactions"],
                slack_user_id_to_zulip_user_id,
                message_id,
                zerver_realmemoji,
            )

        # Process different subtypes of slack messages

        # Subtypes which have only the action in the message should
        # be rendered with '/me' in the content initially
        # For example "sh_room_created" has the message 'started a call'
        # which should be displayed as '/me started a call'
        if subtype in ["bot_add", "sh_room_created", "me_message"]:
            content = f"/me {content}"
        if subtype == "file_comment":
            # The file_comment message type only indicates the
            # responsible user in a subfield.
            message["user"] = message["comment"]["user"]

        file_info = process_message_files(
            message=message,
            domain_name=domain_name,
            realm_id=realm_id,
            message_id=message_id,
            slack_user_id=slack_user_id,
            users=users,
            slack_user_id_to_zulip_user_id=slack_user_id_to_zulip_user_id,
            zerver_attachment=zerver_attachment,
            uploads_list=uploads_list,
        )

        content += file_info["content"]
        has_link = has_link or file_info["has_link"]

        has_attachment = file_info["has_attachment"]
        has_image = file_info["has_image"]

        topic_name = "imported from Slack"

        zulip_message = build_message(
            topic_name,
            float(message["ts"]),
            message_id,
            content,
            rendered_content,
            slack_user_id_to_zulip_user_id[slack_user_id],
            recipient_id,
            has_image,
            has_link,
            has_attachment,
        )
        zerver_message.append(zulip_message)

        (num_created, num_skipped) = build_usermessages(
            zerver_usermessage=zerver_usermessage,
            subscriber_map=subscriber_map,
            recipient_id=recipient_id,
            mentioned_user_ids=mentioned_user_ids,
            message_id=message_id,
            is_private=is_private,
            long_term_idle=long_term_idle,
        )
        total_user_messages += num_created
        total_skipped_user_messages += num_skipped

        # For a direct message, also build usermessages for the sender's
        # own recipient, unless it is the same as the recipient
        # (sender_recipient_id is only set in the pm_name branch).
        if sender_recipient_id is not None and recipient_id != sender_recipient_id:
            (num_created, num_skipped) = build_usermessages(
                zerver_usermessage=zerver_usermessage,
                subscriber_map=subscriber_map,
                recipient_id=sender_recipient_id,
                mentioned_user_ids=mentioned_user_ids,
                message_id=message_id,
                is_private=is_private,
                long_term_idle=long_term_idle,
            )
            total_user_messages += num_created
            total_skipped_user_messages += num_skipped

    logging.debug(
        "Created %s UserMessages; deferred %s due to long-term idle",
        total_user_messages,
        total_skipped_user_messages,
    )
    return zerver_message, zerver_usermessage, zerver_attachment, uploads_list, reaction_list
def write_emoticon_data(realm_id: int,
                        data_dir: str,
                        output_dir: str) -> List[ZerverFieldsT]:
    '''
    Copy the exported emoticon images into Zulip's directory layout,
    write emoji/records.json describing the copies, and return a list
    of RealmEmoji dicts to the caller.

    Expected contents of data_dir:

        emoticons.json - a list of entries shaped like

            {
                "Emoticon": {
                    "id": 9875487,
                    "path": "emoticons/yasss.jpg",
                    "shortcut": "yasss"
                }
            }

        emoticons/ - the image files the "path" values point at, e.g.

            sparkle.png
            spiderman.gif
            supergirl.png

    Every image keeps its file name and is copied below
    <output_dir>/emoji/ at the sub-path given by
    RealmEmoji.PATH_ID_TEMPLATE.
    '''
    logging.info('Starting to process emoticons')

    with open(os.path.join(data_dir, 'emoticons.json')) as json_file:
        raw_entries = ujson.load(json_file)

    # Reduce each entry to the two fields we use before touching any files.
    flat_data = [
        {
            'path': entry['Emoticon']['path'],
            'name': entry['Emoticon']['shortcut'],
        }
        for entry in raw_entries
    ]

    emoji_folder = os.path.join(output_dir, 'emoji')
    os.makedirs(emoji_folder, exist_ok=True)

    emoji_records = []
    for item in flat_data:
        image_sub_path = item['path']
        file_name = os.path.basename(image_sub_path)
        source_path = os.path.join(data_dir, image_sub_path)

        # RealmEmoji.PATH_ID_TEMPLATE = "{realm_id}/emoji/images/{emoji_file_name}"
        target_path = os.path.join(
            emoji_folder,
            RealmEmoji.PATH_ID_TEMPLATE.format(
                realm_id=realm_id,
                emoji_file_name=file_name,
            ),
        )
        os.makedirs(os.path.dirname(target_path), exist_ok=True)

        source_path = os.path.abspath(source_path)
        target_path = os.path.abspath(target_path)
        shutil.copyfile(source_path, target_path)

        emoji_records.append({
            'path': target_path,
            'file_name': file_name,
            'realm_id': realm_id,
            'name': item['name'],
        })

    create_converted_data_files(emoji_records, output_dir, '/emoji/records.json')

    realmemoji = []
    for record in emoji_records:
        realmemoji.append(build_realm_emoji(
            realm_id=realm_id,
            name=record['name'],
            id=NEXT_ID('realmemoji'),
            file_name=record['file_name'],
        ))

    logging.info('Done processing emoticons')
    return realmemoji
def write_emoticon_data(realm_id: int,
                        custom_emoji_data: List[Dict[str, Any]],
                        data_dir: str,
                        output_dir: str) -> List[ZerverFieldsT]:
    '''
    Copy the exported custom emoji images into Zulip's directory
    layout, write emoji/records.json describing the copies, and return
    a list of RealmEmoji dicts to the caller.

    The exported JSON file has emoji rows if it contains any custom
    emoji, for example:

        {
            "type": "emoji",
            "emoji": {"name": "peerdium",
                      "image": "exported_emoji/h15ni7kf1bnj7jeua4qhmctsdo/image"}
        }

    and data_dir holds the matching image files:

        exported_emoji/7u7x8ytgp78q8jir81o9ejwwnr/image
        exported_emoji/h15ni7kf1bnj7jeua4qhmctsdo/image

    Each entry of custom_emoji_data is read for its "name" and "image"
    keys.  The image is copied below <output_dir>/emoji/ at the
    sub-path given by RealmEmoji.PATH_ID_TEMPLATE, using the emoji name
    as the target file name.
    '''
    logging.info('Starting to process emoticons')

    # Reduce each entry to the two fields we use before touching any files.
    flat_data = [
        {'path': emoji['image'], 'name': emoji['name']}
        for emoji in custom_emoji_data
    ]

    emoji_folder = os.path.join(output_dir, 'emoji')
    os.makedirs(emoji_folder, exist_ok=True)

    emoji_records = []
    for item in flat_data:
        source_path = os.path.join(data_dir, item['path'])

        # The emoji name doubles as the file name in the target tree.
        file_name = item["name"]
        target_path = os.path.join(
            emoji_folder,
            RealmEmoji.PATH_ID_TEMPLATE.format(
                realm_id=realm_id,
                emoji_file_name=file_name,
            ),
        )
        os.makedirs(os.path.dirname(target_path), exist_ok=True)

        source_path = os.path.abspath(source_path)
        target_path = os.path.abspath(target_path)
        shutil.copyfile(source_path, target_path)

        emoji_records.append({
            'path': target_path,
            's3_path': target_path,
            'file_name': file_name,
            'realm_id': realm_id,
            'name': item['name'],
        })

    create_converted_data_files(emoji_records, output_dir, '/emoji/records.json')

    realmemoji = []
    for record in emoji_records:
        realmemoji.append(build_realm_emoji(
            realm_id=realm_id,
            name=record['name'],
            id=NEXT_ID('realmemoji'),
            file_name=record['file_name'],
        ))

    logging.info('Done processing emoticons')
    return realmemoji
def do_convert_data(mattermost_data_dir: str, output_dir: str, masking_content: bool) -> None:
    """Convert a Mattermost export into one Zulip import tree per team.

    Reads ``export.json`` from ``mattermost_data_dir``.  For every team
    in the export, a realm is built from its users, channels,
    subscriptions, custom emoji, messages and reactions, and written to
    ``<output_dir>/<team name>``; that directory is then packed into a
    ``.tar.gz`` next to it.

    Raises an Exception if ``output_dir`` is not empty.
    """
    os.makedirs(output_dir, exist_ok=True)
    if os.listdir(output_dir):  # nocoverage
        raise Exception("Output directory should be empty!")

    mattermost_data = mattermost_data_file_to_dict(
        os.path.join(mattermost_data_dir, "export.json"))

    username_to_user = create_username_to_user_mapping(
        mattermost_data["user"])  # type: Dict[str, Dict[str, Any]]

    teams = mattermost_data["team"]
    num_teams = len(teams)

    for team in teams:
        realm_id = NEXT_ID("realm_id")
        team_name = team["name"]

        # Handlers and id mappers are created afresh for every team.
        user_handler = UserHandler()
        subscriber_handler = SubscriberHandler()
        user_id_mapper = IdMapper()
        stream_id_mapper = IdMapper()

        print("Generating data for", team_name)
        realm = make_realm(realm_id, team)
        realm_output_dir = os.path.join(output_dir, team_name)

        reset_mirror_dummy_users(username_to_user)
        label_mirror_dummy_users(num_teams, team_name, mattermost_data, username_to_user)

        convert_user_data(
            user_handler=user_handler,
            user_id_mapper=user_id_mapper,
            user_data_map=username_to_user,
            realm_id=realm_id,
            team_name=team_name,
        )

        zerver_stream = convert_channel_data(
            channel_data=mattermost_data["channel"],
            user_data_map=username_to_user,
            subscriber_handler=subscriber_handler,
            stream_id_mapper=stream_id_mapper,
            user_id_mapper=user_id_mapper,
            realm_id=realm_id,
            team_name=team_name,
        )
        realm['zerver_stream'] = zerver_stream

        zerver_recipient = build_recipients(
            zerver_userprofile=user_handler.get_all_users(),
            zerver_stream=zerver_stream,
        )
        realm['zerver_recipient'] = zerver_recipient

        stream_subscriptions = build_stream_subscriptions(
            get_users=subscriber_handler.get_users,
            zerver_recipient=zerver_recipient,
            zerver_stream=zerver_stream,
        )
        personal_subscriptions = build_personal_subscriptions(
            zerver_recipient=zerver_recipient,
        )

        # Mattermost currently supports only exporting messages from channels.
        # Personal messages and huddles are not exported.
        zerver_subscription = personal_subscriptions + stream_subscriptions
        realm['zerver_subscription'] = zerver_subscription

        zerver_realmemoji = write_emoticon_data(
            realm_id=realm_id,
            custom_emoji_data=mattermost_data["emoji"],
            data_dir=mattermost_data_dir,
            output_dir=realm_output_dir,
        )
        realm['zerver_realmemoji'] = zerver_realmemoji

        subscriber_map = make_subscriber_map(
            zerver_subscription=zerver_subscription,
        )

        # Passed to write_message_data, then stored on the realm below.
        total_reactions = []  # type: List[Dict[str, Any]]
        write_message_data(
            num_teams=num_teams,
            team_name=team_name,
            realm_id=realm_id,
            post_data=mattermost_data["post"],
            zerver_recipient=zerver_recipient,
            subscriber_map=subscriber_map,
            output_dir=realm_output_dir,
            masking_content=masking_content,
            stream_id_mapper=stream_id_mapper,
            user_id_mapper=user_id_mapper,
            user_handler=user_handler,
            username_to_user=username_to_user,
            zerver_realmemoji=zerver_realmemoji,
            total_reactions=total_reactions,
        )
        realm['zerver_reaction'] = total_reactions
        realm['zerver_userprofile'] = user_handler.get_all_users()
        realm['sort_by_date'] = True

        create_converted_data_files(realm, realm_output_dir, '/realm.json')

        # Mattermost currently doesn't support exporting avatars or
        # uploads, so both record files are written empty.
        for empty_records_file in ('/avatars/records.json', '/uploads/records.json'):
            create_converted_data_files([], realm_output_dir, empty_records_file)

        # Mattermost currently doesn't support exporting attachments
        attachment = {"zerver_attachment": []}  # type: Dict[str, List[Any]]
        create_converted_data_files(attachment, realm_output_dir, '/attachment.json')

        logging.info('Start making tarball')
        tar_command = ["tar", "-czf", realm_output_dir + '.tar.gz', realm_output_dir, '-P']
        subprocess.check_call(tar_command)
        logging.info('Done making tarball')
def channel_message_to_zerver_message(
        realm_id: int,
        users: List[ZerverFieldsT],
        added_users: AddedUsersT,
        added_recipient: AddedRecipientsT,
        all_messages: List[ZerverFieldsT],
        zerver_realmemoji: List[ZerverFieldsT],
        subscriber_map: Dict[int, Set[int]],
        added_channels: AddedChannelsT,
        domain_name: str,
        long_term_idle: Set[int]
) -> Tuple[List[ZerverFieldsT],
           List[ZerverFieldsT],
           List[ZerverFieldsT],
           List[ZerverFieldsT],
           List[ZerverFieldsT]]:
    """
    Convert Slack channel messages into Zulip rows.

    Messages without a sending user, messages with a skipped subtype
    and messages whose text cannot be converted are dropped.

    Returns:
    1. zerver_message, which is a list of the messages
    2. zerver_usermessage, which is a list of the usermessages
    3. zerver_attachment, which is a list of the attachments
    4. uploads_list, which is a list of uploads to be mapped in uploads records.json
    5. reaction_list, which is a list of all user reactions
    """
    # Subtypes that are dropped entirely.
    skipped_subtypes = (
        # Zulip doesn't have a pinned_item concept
        "pinned_item",
        "unpinned_item",
        # Slack's channel join/leave notices are spammy
        "channel_join",
        "channel_leave",
        "channel_name",
    )
    # Subtypes whose text is only an action ('started a call'), which
    # should be displayed as '/me started a call'.
    action_subtypes = ("bot_add", "sh_room_created", "me_message")

    zerver_message = []
    zerver_usermessage = []  # type: List[ZerverFieldsT]
    uploads_list = []  # type: List[ZerverFieldsT]
    zerver_attachment = []  # type: List[ZerverFieldsT]
    reaction_list = []  # type: List[ZerverFieldsT]

    # For unicode emoji
    with open(NAME_TO_CODEPOINT_PATH) as codepoint_file:
        name_to_codepoint = ujson.load(codepoint_file)

    created_total = 0
    skipped_total = 0

    for message in all_messages:
        user = get_message_sending_user(message)
        if not user:
            # Slack sometimes produces messages without a user name;
            # ignore them.
            continue

        subtype = message.get('subtype', False)
        if subtype in skipped_subtypes:
            continue

        try:
            content, mentioned_user_ids, has_link = convert_to_zulip_markdown(
                message['text'], users, added_channels, added_users)
        except Exception:
            print("Slack message unexpectedly missing text representation:")
            print(ujson.dumps(message, indent=4))
            continue

        recipient_id = added_recipient[message['channel_name']]
        message_id = NEXT_ID('message')

        # Process message reactions
        if 'reactions' in message.keys():
            build_reactions(reaction_list, message['reactions'], added_users,
                            message_id, name_to_codepoint,
                            zerver_realmemoji)

        if subtype in action_subtypes:
            content = '/me %s' % content
        if subtype == 'file_comment':
            # The file_comment message type only indicates the
            # responsible user in a subfield.
            message['user'] = message['comment']['user']

        file_info = process_message_files(
            message=message,
            domain_name=domain_name,
            realm_id=realm_id,
            message_id=message_id,
            user=user,
            users=users,
            added_users=added_users,
            zerver_attachment=zerver_attachment,
            uploads_list=uploads_list,
        )
        content += file_info['content']
        has_link = has_link or file_info['has_link']

        # construct message
        zerver_message.append(build_message(
            'imported from slack',
            float(message['ts']),
            message_id,
            content,
            None,
            added_users[user],
            recipient_id,
            file_info['has_image'],
            has_link,
            file_info['has_attachment'],
        ))

        # construct usermessages
        num_created, num_skipped = build_usermessages(
            zerver_usermessage=zerver_usermessage,
            subscriber_map=subscriber_map,
            recipient_id=recipient_id,
            mentioned_user_ids=mentioned_user_ids,
            message_id=message_id,
            long_term_idle=long_term_idle,
        )
        created_total += num_created
        skipped_total += num_skipped

    summary = "Created %s UserMessages; deferred %s due to long-term idle" % (
        created_total, skipped_total)
    logging.debug(summary)

    return (zerver_message, zerver_usermessage, zerver_attachment,
            uploads_list, reaction_list)
def process_raw_message_batch(
        realm_id: int,
        raw_messages: List[Dict[str, Any]],
        subscriber_map: Dict[int, Set[int]],
        user_id_mapper: IdMapper,
        user_handler: UserHandler,
        get_recipient_id_from_receiver_name: Callable[[str, int], int],
        is_pm_data: bool,
        output_dir: str,
        zerver_realmemoji: List[Dict[str, Any]],
        total_reactions: List[Dict[str, Any]],
) -> None:
    """Convert one batch of raw Mattermost messages and dump it as JSON.

    Each raw message gets a fresh message id, has its mentions rewritten
    to Zulip syntax and its content passed through html2text.  Messages
    longer than 10000 characters, and messages whose stream recipient
    cannot be found (KeyError), are skipped.  Reactions of kept messages
    are added to ``total_reactions``.  The message and usermessage rows
    are written to a numbered ``messages-NNNNNN.json`` file via
    ``create_converted_data_files``.
    """

    def fix_mentions(content: str, mention_user_ids: Set[int]) -> str:
        # Swap every "@short_name" for "@**Full Name**" first, then map
        # the wildcard mentions onto Zulip's "@**all**".
        fixed = content
        for mentioned_id in mention_user_ids:
            profile = user_handler.get_user(user_id=mentioned_id)
            fixed = fixed.replace(
                '@{short_name}'.format(**profile),
                '@**{full_name}**'.format(**profile),
            )

        # We don't have an equivalent for Mattermost's @here mention,
        # which mentions all users online in the channel, so it is
        # treated like the other wildcards.
        for wildcard in ('@channel', '@all', '@here'):
            fixed = fixed.replace(wildcard, '@**all**')
        return fixed

    # message id -> ids of the users mentioned in that message
    mention_map = {}  # type: Dict[int, Set[int]]
    zerver_message = []

    import html2text
    html_converter = html2text.HTML2Text()

    name_to_codepoint = get_name_to_codepoint_dict()

    for raw_message in raw_messages:
        message_id = NEXT_ID('message')
        mention_user_ids = get_mentioned_user_ids(raw_message, user_id_mapper)
        mention_map[message_id] = mention_user_ids

        content = html_converter.handle(fix_mentions(
            content=raw_message['content'],
            mention_user_ids=mention_user_ids,
        ))

        if len(content) > 10000:  # nocoverage
            logging.info('skipping too-long message of length %s' % (len(content), ))
            continue

        date_sent = raw_message['date_sent']
        sender_user_id = raw_message['sender_id']
        try:
            recipient_id = get_recipient_id_from_receiver_name(
                raw_message["receiver_id"], Recipient.STREAM)
        except KeyError:
            logging.debug(
                "Could not find recipient_id for a message, skipping.")
            continue

        zerver_message.append(build_message(
            content=content,
            message_id=message_id,
            date_sent=date_sent,
            recipient_id=recipient_id,
            rendered_content=None,
            topic_name='imported from mattermost',
            user_id=sender_user_id,
            has_attachment=False,
        ))
        build_reactions(realm_id, total_reactions, raw_message["reactions"],
                        message_id, name_to_codepoint, user_id_mapper,
                        zerver_realmemoji)

    zerver_usermessage = make_user_messages(
        zerver_message=zerver_message,
        subscriber_map=subscriber_map,
        is_pm_data=is_pm_data,
        mention_map=mention_map,
    )

    # The dump file counter is kept separately for each realm.
    message_file = "/messages-%06d.json" % (NEXT_ID('dump_file_id' + str(realm_id)), )
    create_converted_data_files(
        dict(
            zerver_message=zerver_message,
            zerver_usermessage=zerver_usermessage,
        ),
        output_dir,
        message_file,
    )
def process_raw_message_batch(realm_id: int,
                              raw_messages: List[Dict[str, Any]],
                              subscriber_map: Dict[int, Set[int]],
                              user_id_mapper: IdMapper,
                              user_handler: UserHandler,
                              attachment_handler: AttachmentHandler,
                              get_recipient_id: Callable[[ZerverFieldsT], int],
                              is_pm_data: bool,
                              output_dir: str) -> None:
    """Convert one batch of raw HipChat messages and dump it as JSON.

    Each raw message gets a fresh message id; its mentioned user ids are
    translated through ``user_id_mapper`` (ids the mapper does not have
    are dropped) and its mentions rewritten to Zulip syntax.  Messages
    longer than 10000 characters, and messages for which
    ``get_recipient_id`` raises KeyError, are skipped.  Attachments are
    handed to ``attachment_handler``, whose returned text is appended to
    the body.  The message and usermessage rows are written to a
    numbered ``messages-NNNNNN.json`` file via
    ``create_converted_data_files``.
    """

    def fix_mentions(content: str, mention_user_ids: Set[int]) -> str:
        # Swap every "@short_name" for "@**Full Name**", then map the
        # "@here" wildcard onto Zulip's "@**all**".
        fixed = content
        for mentioned_id in mention_user_ids:
            profile = user_handler.get_user(user_id=mentioned_id)
            fixed = fixed.replace(
                '@{short_name}'.format(**profile),
                '@**{full_name}**'.format(**profile),
            )
        return fixed.replace('@here', '@**all**')

    # message id -> ids of the users mentioned in that message
    mention_map = {}  # type: Dict[int, Set[int]]
    zerver_message = []

    for raw_message in raw_messages:
        message_id = NEXT_ID('message')

        # Keep only the mentioned ids the mapper has, translated by it.
        mention_user_ids = {
            user_id_mapper.get(raw_id)
            for raw_id in set(raw_message['mention_user_ids'])
            if user_id_mapper.has(raw_id)
        }
        mention_map[message_id] = mention_user_ids

        content = fix_mentions(
            content=raw_message['content'],
            mention_user_ids=mention_user_ids,
        )
        if len(content) > 10000:
            logging.info('skipping too-long message of length %s' % (len(content), ))
            continue

        pub_date = raw_message['pub_date']

        try:
            recipient_id = get_recipient_id(raw_message)
        except KeyError:
            logging.debug(
                "Could not find recipient_id for a message, skipping.")
            continue

        # Private messages carry an empty topic.
        topic_name = '' if is_pm_data else 'imported from hipchat'
        user_id = raw_message['sender_id']

        # The attachment handler records the upload as a side effect and
        # hands back text to append to the message (falsy if none).
        extra_content = attachment_handler.handle_message_data(
            realm_id=realm_id,
            message_id=message_id,
            sender_id=user_id,
            attachment=raw_message['attachment'],
            files_dir=raw_message['files_dir'],
        )
        has_attachment = bool(extra_content)
        if has_attachment:
            content += '\n' + extra_content

        zerver_message.append(build_message(
            content=content,
            message_id=message_id,
            pub_date=pub_date,
            recipient_id=recipient_id,
            rendered_content=None,
            topic_name=topic_name,
            user_id=user_id,
            has_attachment=has_attachment,
        ))

    zerver_usermessage = make_user_messages(
        zerver_message=zerver_message,
        subscriber_map=subscriber_map,
        is_pm_data=is_pm_data,
        mention_map=mention_map,
    )

    message_file = "/messages-%06d.json" % (NEXT_ID('dump_file_id'), )
    create_converted_data_files(
        dict(
            zerver_message=zerver_message,
            zerver_usermessage=zerver_usermessage,
        ),
        output_dir,
        message_file,
    )
def process_raw_message_batch(
    realm_id: int,
    raw_messages: List[Dict[str, Any]],
    subscriber_map: Dict[int, Set[int]],
    user_handler: UserHandler,
    is_pm_data: bool,
    output_dir: str,
    zerver_realmemoji: List[ZerverFieldsT],
    total_reactions: List[ZerverFieldsT],
    uploads_list: List[ZerverFieldsT],
    zerver_attachment: List[ZerverFieldsT],
    upload_id_to_upload_data_map: Dict[str, Dict[str, Any]],
) -> None:
    """Convert one batch of raw Rocket.Chat messages and dump it as JSON.

    Each raw message gets a fresh message id and has its user, wildcard
    and channel mentions rewritten to Zulip syntax.  Messages longer
    than 10000 characters (measured before any attachment text is
    appended) are skipped.  A "file" entry on the raw message is passed
    to ``process_message_attachment`` and the text it returns is
    appended to the body.  Reactions of kept messages are added to
    ``total_reactions``.  The message and usermessage rows are written
    to a numbered ``messages-NNNNNN.json`` file via
    ``create_converted_data_files``.
    """

    def fix_mentions(
        content: str, mention_user_ids: Set[int], rc_channel_mention_data: List[Dict[str, str]]
    ) -> str:
        fixed = content

        # User mentions: "@short_name" -> "@**Full Name**".
        for mentioned_id in mention_user_ids:
            profile = user_handler.get_user(user_id=mentioned_id)
            fixed = fixed.replace(
                "@{short_name}".format(**profile),
                "@**{full_name}**".format(**profile),
            )

        # Wildcard mentions.  We don't have an equivalent for
        # Rocket.Chat's @here mention, which mentions all users active
        # in the channel, so it is treated like @all.
        for wildcard in ("@all", "@here"):
            fixed = fixed.replace(wildcard, "@**all**")

        # Channel mentions, using the replacement pairs prepared upstream.
        for channel_mention in rc_channel_mention_data:
            fixed = fixed.replace(
                channel_mention["rc_mention"], channel_mention["zulip_mention"]
            )

        return fixed

    # Both maps are keyed by message id.
    user_mention_map: Dict[int, Set[int]] = {}
    wildcard_mention_map: Dict[int, bool] = {}
    zerver_message: List[ZerverFieldsT] = []

    for raw_message in raw_messages:
        message_id = NEXT_ID("message")
        mention_user_ids = raw_message["mention_user_ids"]
        user_mention_map[message_id] = mention_user_ids
        wildcard_mention_map[message_id] = raw_message["wildcard_mention"]

        content = fix_mentions(
            content=raw_message["content"],
            mention_user_ids=mention_user_ids,
            rc_channel_mention_data=raw_message["rc_channel_mention_data"],
        )

        if len(content) > 10000:  # nocoverage
            logging.info("skipping too-long message of length %s", len(content))
            continue

        date_sent = raw_message["date_sent"]
        sender_user_id = raw_message["sender_id"]
        recipient_id = raw_message["recipient_id"]

        has_link = raw_message["has_link"]
        has_attachment = "file" in raw_message
        has_image = False

        if has_attachment:
            # A message with an upload is always flagged as having a link.
            has_link = True
            attachment_content, has_image = process_message_attachment(
                upload=raw_message["file"],
                realm_id=realm_id,
                message_id=message_id,
                user_id=sender_user_id,
                user_handler=user_handler,
                uploads_list=uploads_list,
                zerver_attachment=zerver_attachment,
                upload_id_to_upload_data_map=upload_id_to_upload_data_map,
                output_dir=output_dir,
            )
            content += attachment_content

        topic_name = raw_message["topic_name"]

        zerver_message.append(
            build_message(
                content=content,
                message_id=message_id,
                date_sent=date_sent,
                recipient_id=recipient_id,
                rendered_content=None,
                topic_name=topic_name,
                user_id=sender_user_id,
                has_image=has_image,
                has_link=has_link,
                has_attachment=has_attachment,
            )
        )
        build_reactions(
            total_reactions=total_reactions,
            reactions=raw_message["reactions"],
            message_id=message_id,
            zerver_realmemoji=zerver_realmemoji,
        )

    zerver_usermessage = make_user_messages(
        zerver_message=zerver_message,
        subscriber_map=subscriber_map,
        is_pm_data=is_pm_data,
        mention_map=user_mention_map,
        wildcard_mention_map=wildcard_mention_map,
    )

    # The dump file counter is kept separately for each realm.
    dump_file_id = NEXT_ID("dump_file_id" + str(realm_id))
    message_file = f"/messages-{dump_file_id:06}.json"
    create_converted_data_files(
        dict(
            zerver_message=zerver_message,
            zerver_usermessage=zerver_usermessage,
        ),
        output_dir,
        message_file,
    )
def build_custom_emoji(
    realm_id: int, custom_emoji_data: Dict[str, List[Dict[str, Any]]], output_dir: str
) -> List[ZerverFieldsT]:
    """Write Rocket.Chat custom emoji images and return RealmEmoji rows.

    ``custom_emoji_data`` is read for three lists: "file" (one entry per
    stored file, with "_id" and "filename"), "chunk" (pieces of file
    content, each with the "files_id" it belongs to and its "data") and
    "emoji" (with "name", "extension" and "aliases").

    The chunks of each emoji's file are joined and written below
    <output_dir>/emoji/ at the sub-path given by
    RealmEmoji.PATH_ID_TEMPLATE.  One record and one RealmEmoji row are
    produced for the emoji name and for each of its aliases, all
    pointing at the same image.  The records are written to
    emoji/records.json; the RealmEmoji rows are returned.
    """
    logging.info("Starting to process custom emoji")

    emoji_folder = os.path.join(output_dir, "emoji")
    os.makedirs(emoji_folder, exist_ok=True)

    zerver_realmemoji: List[ZerverFieldsT] = []
    emoji_records: List[ZerverFieldsT] = []

    # file id -> {"filename": ..., "chunks": [...]}
    files_by_id = {}
    for file_row in custom_emoji_data["file"]:
        file_entry = {"filename": file_row["filename"], "chunks": []}
        files_by_id[file_row["_id"]] = file_entry
    for chunk_row in custom_emoji_data["chunk"]:
        chunk_list = files_by_id[chunk_row["files_id"]]["chunks"]
        chunk_list.append(chunk_row["data"])

    # Build custom emoji
    for rc_emoji in custom_emoji_data["emoji"]:
        # The file id is "<name>.<extension>".
        # Subject to change with changes in database
        file_id = ".".join([rc_emoji["name"], rc_emoji["extension"]])
        file_entry = files_by_id[file_id]

        emoji_filename = file_entry["filename"]
        image_bytes = b"".join(file_entry["chunks"])

        target_path = os.path.join(
            emoji_folder,
            RealmEmoji.PATH_ID_TEMPLATE.format(
                realm_id=realm_id,
                emoji_file_name=emoji_filename,
            ),
        )
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        with open(target_path, "wb") as image_file:
            image_file.write(image_bytes)

        # The primary name comes first, followed by every alias.
        for emoji_name in [rc_emoji["name"], *rc_emoji["aliases"]]:
            emoji_records.append(
                {
                    "path": target_path,
                    "s3_path": target_path,
                    "file_name": emoji_filename,
                    "realm_id": realm_id,
                    "name": emoji_name,
                }
            )
            zerver_realmemoji.append(
                build_realm_emoji(
                    realm_id=realm_id,
                    name=emoji_name,
                    id=NEXT_ID("realmemoji"),
                    file_name=emoji_filename,
                )
            )

    create_converted_data_files(emoji_records, output_dir, "/emoji/records.json")
    logging.info("Done processing emoji")

    return zerver_realmemoji
def process_raw_message_batch(
    realm_id: int,
    raw_messages: List[Dict[str, Any]],
    subscriber_map: Dict[int, Set[int]],
    user_id_mapper: IdMapper,
    user_handler: UserHandler,
    get_recipient_id_from_receiver_name: Callable[[str, int], int],
    is_pm_data: bool,
    output_dir: str,
    zerver_realmemoji: List[Dict[str, Any]],
    total_reactions: List[Dict[str, Any]],
) -> None:
    """Convert one batch of raw Mattermost messages and dump it as JSON.

    Each raw message gets a fresh message id, has its mentions rewritten
    to Zulip syntax and its content passed through html2text.  Messages
    longer than 10000 characters are skipped.  The recipient is resolved
    from the raw message's "channel_name", "huddle_name" or "pm_members"
    key; a message with none of them raises AssertionError.  Reactions
    of kept messages are added to ``total_reactions``.  The message and
    usermessage rows are written to a numbered ``messages-NNNNNN.json``
    file via ``create_converted_data_files``.
    """

    def fix_mentions(content: str, mention_user_ids: Set[int]) -> str:
        # Swap every "@short_name" for "@**Full Name**" first, then map
        # the wildcard mentions onto Zulip's "@**all**".
        fixed = content
        for mentioned_id in mention_user_ids:
            profile = user_handler.get_user(user_id=mentioned_id)
            fixed = fixed.replace(
                '@{short_name}'.format(**profile),
                '@**{full_name}**'.format(**profile),
            )

        # We don't have an equivalent for Mattermost's @here mention,
        # which mentions all users online in the channel, so it is
        # treated like the other wildcards.
        for wildcard in ('@channel', '@all', '@here'):
            fixed = fixed.replace(wildcard, '@**all**')
        return fixed

    # message id -> ids of the users mentioned in that message
    mention_map: Dict[int, Set[int]] = {}
    zerver_message = []

    import html2text
    html_converter = html2text.HTML2Text()

    # message id -> mapped ids of the members of a personal conversation
    pm_members = {}

    for raw_message in raw_messages:
        message_id = NEXT_ID('message')
        mention_user_ids = get_mentioned_user_ids(raw_message, user_id_mapper)
        mention_map[message_id] = mention_user_ids

        content = html_converter.handle(fix_mentions(
            content=raw_message['content'],
            mention_user_ids=mention_user_ids,
        ))

        if len(content) > 10000:  # nocoverage
            logging.info('skipping too-long message of length %s', len(content))
            continue

        date_sent = raw_message['date_sent']
        sender_user_id = raw_message['sender_id']

        if "channel_name" in raw_message:
            recipient_id = get_recipient_id_from_receiver_name(
                raw_message["channel_name"], Recipient.STREAM)
        elif "huddle_name" in raw_message:
            recipient_id = get_recipient_id_from_receiver_name(
                raw_message["huddle_name"], Recipient.HUDDLE)
        elif "pm_members" in raw_message:
            members = raw_message["pm_members"]
            pm_members[message_id] = {user_id_mapper.get(member) for member in members}

            # The recipient is whichever of the two members is not the sender.
            if sender_user_id == user_id_mapper.get(members[0]):
                receiver_name = members[1]
            else:
                receiver_name = members[0]
            recipient_id = get_recipient_id_from_receiver_name(
                receiver_name, Recipient.PERSONAL)
        else:
            raise AssertionError(
                "raw_message without channel_name, huddle_name or pm_members key"
            )

        zerver_message.append(build_message(
            content=content,
            message_id=message_id,
            date_sent=date_sent,
            recipient_id=recipient_id,
            rendered_content=None,
            topic_name='imported from mattermost',
            user_id=sender_user_id,
            has_attachment=False,
        ))
        build_reactions(realm_id, total_reactions, raw_message["reactions"],
                        message_id, user_id_mapper, zerver_realmemoji)

    zerver_usermessage = make_user_messages(
        zerver_message=zerver_message,
        subscriber_map=subscriber_map,
        is_pm_data=is_pm_data,
        mention_map=mention_map,
    )

    # The dump file counter is kept separately for each realm.
    dump_file_id = NEXT_ID('dump_file_id' + str(realm_id))
    message_file = f"/messages-{dump_file_id:06}.json"
    create_converted_data_files(
        dict(
            zerver_message=zerver_message,
            zerver_usermessage=zerver_usermessage,
        ),
        output_dir,
        message_file,
    )
def process_raw_message_batch(
    realm_id: int,
    raw_messages: List[Dict[str, Any]],
    subscriber_map: Dict[int, Set[int]],
    user_handler: UserHandler,
    is_pm_data: bool,
    output_dir: str,
    total_reactions: List[ZerverFieldsT],
) -> None:
    """Convert one batch of raw Rocket.Chat messages and dump it as JSON.

    Each raw message gets a fresh message id and has its mentions
    rewritten to Zulip syntax.  Messages longer than 10000 characters
    are skipped.  Reactions of kept messages are added to
    ``total_reactions``.  The message and usermessage rows are written
    to a numbered ``messages-NNNNNN.json`` file via
    ``create_converted_data_files``.
    """

    def fix_mentions(content: str, mention_user_ids: Set[int]) -> str:
        # Swap every "@short_name" for "@**Full Name**" first, then map
        # the wildcard mentions onto Zulip's "@**all**".
        fixed = content
        for mentioned_id in mention_user_ids:
            profile = user_handler.get_user(user_id=mentioned_id)
            fixed = fixed.replace(
                "@{short_name}".format(**profile),
                "@**{full_name}**".format(**profile),
            )

        # We don't have an equivalent for Rocket.Chat's @here mention,
        # which mentions all users active in the channel, so it is
        # treated like @all.
        for wildcard in ("@all", "@here"):
            fixed = fixed.replace(wildcard, "@**all**")
        return fixed

    # message id -> ids of the users mentioned in that message
    mention_map: Dict[int, Set[int]] = {}
    zerver_message: List[ZerverFieldsT] = []

    for raw_message in raw_messages:
        message_id = NEXT_ID("message")
        mention_user_ids = raw_message["mention_user_ids"]
        mention_map[message_id] = mention_user_ids

        content = fix_mentions(
            content=raw_message["content"],
            mention_user_ids=mention_user_ids,
        )

        if len(content) > 10000:  # nocoverage
            logging.info("skipping too-long message of length %s", len(content))
            continue

        date_sent = raw_message["date_sent"]
        sender_user_id = raw_message["sender_id"]
        recipient_id = raw_message["recipient_id"]
        topic_name = raw_message["topic_name"]

        zerver_message.append(
            build_message(
                content=content,
                message_id=message_id,
                date_sent=date_sent,
                recipient_id=recipient_id,
                rendered_content=None,
                topic_name=topic_name,
                user_id=sender_user_id,
                has_attachment=False,
            )
        )
        build_reactions(
            total_reactions=total_reactions,
            reactions=raw_message["reactions"],
            message_id=message_id,
        )

    zerver_usermessage = make_user_messages(
        zerver_message=zerver_message,
        subscriber_map=subscriber_map,
        is_pm_data=is_pm_data,
        mention_map=mention_map,
    )

    # The dump file counter is kept separately for each realm.
    dump_file_id = NEXT_ID("dump_file_id" + str(realm_id))
    message_file = f"/messages-{dump_file_id:06}.json"
    create_converted_data_files(
        dict(
            zerver_message=zerver_message,
            zerver_usermessage=zerver_usermessage,
        ),
        output_dir,
        message_file,
    )