Esempio n. 1
0
def bulk_import_user_message_data(data: TableData, dump_file_id: int) -> None:
    """Insert the ``zerver_usermessage`` rows of ``data`` in batches.

    Args:
        data: Table data keyed by table name; only the
            ``'zerver_usermessage'`` entry is read here.
        dump_file_id: Identifier of the dump file being imported; it is only
            used in the completion log message.
    """
    model = UserMessage
    table = 'zerver_usermessage'
    lst = data[table]

    def process_batch(items: List[Dict[str, Any]]) -> None:
        # Only these three fields of each row are carried over.
        ums = [
            UserMessageLite(
                user_profile_id=item['user_profile_id'],
                message_id=item['message_id'],
                flags=item['flags'],
            )
            for item in items
        ]
        bulk_insert_ums(ums)

    chunk_size = 10000

    process_list_in_batches(
        lst=lst,
        chunk_size=chunk_size,
        process_batch=process_batch,
    )

    # Hand the values to logging as arguments instead of pre-formatting the
    # string, so interpolation is left to the logging machinery.
    logging.info("Successfully imported %s from %s[%s].", model, table, dump_file_id)
Esempio n. 2
0
def bulk_import_user_message_data(data: TableData, dump_file_id: int) -> None:
    """Bulk-insert the user-message rows found in ``data``.

    Args:
        data: Table data keyed by table name; the ``'zerver_usermessage'``
            list is the only entry consumed.
        dump_file_id: Identifier of the dump file the rows came from; it
            appears only in the final log line.
    """
    model = UserMessage
    table = 'zerver_usermessage'
    lst = data[table]

    # IMPORTANT NOTE: We do not use any primary id
    # data from either the import itself or ID_MAP.
    # We let the DB itself generate ids.  Note that
    # no tables use user_message.id as a foreign key,
    # so we can safely avoid all re-mapping complexity.

    def process_batch(items: List[Dict[str, Any]]) -> None:
        # Build lightweight row objects from the three fields we keep, then
        # insert the whole batch at once.
        ums = [
            UserMessageLite(
                user_profile_id=item['user_profile_id'],
                message_id=item['message_id'],
                flags=item['flags'],
            )
            for item in items
        ]
        bulk_insert_ums(ums)

    chunk_size = 10000

    process_list_in_batches(
        lst=lst,
        chunk_size=chunk_size,
        process_batch=process_batch,
    )

    # Use logging's own argument interpolation rather than eager
    # %-formatting of the message string.
    logging.info("Successfully imported %s from %s[%s].", model, table, dump_file_id)
Esempio n. 3
0
def process_message_file(realm_id: int,
                         fn: str,
                         fn_id: int,
                         files_dir: str,
                         get_recipient_id: Callable[[ZerverFieldsT], int],
                         message_key: str,
                         zerver_subscription: List[ZerverFieldsT],
                         data_dir: str,
                         output_dir: str,
                         user_handler: UserHandler,
                         attachment_handler: AttachmentHandler) -> None:
    """Load one exported message file and process its messages in batches.

    Args:
        realm_id: Forwarded to ``process_raw_message_batch``.
        fn: Path of the JSON file to read.
        fn_id: Stored on every raw message as ``fn_id``.
        files_dir: Stored on every raw message as ``files_dir``.
        get_recipient_id: Forwarded to ``process_raw_message_batch``.
        message_key: Key under which each entry of the JSON file holds its
            message; entries without this key are ignored.
        zerver_subscription: Forwarded to ``process_raw_message_batch``.
        data_dir: Not used by this function.
        output_dir: Forwarded to ``process_raw_message_batch``.
        user_handler: Forwarded to ``process_raw_message_batch``.
        attachment_handler: Forwarded to ``process_raw_message_batch``.
    """

    def get_raw_messages(fn: str) -> List[ZerverFieldsT]:
        # Open the file in a context manager so the handle is closed as soon
        # as parsing finishes instead of whenever it is garbage-collected.
        with open(fn) as f:
            data = json.load(f)

        flat_data = [
            d[message_key]
            for d in data
            if message_key in d
        ]

        return [
            dict(
                fn_id=fn_id,
                sender_id=d['sender']['id'],
                # 'receiver' may be absent, in which case receiver_id is None.
                receiver_id=d.get('receiver', {}).get('id'),
                content=d['message'],
                mention_user_ids=d['mentions'],
                pub_date=str_date_to_float(d['timestamp']),
                attachment=d['attachment'],
                files_dir=files_dir,
            )
            for d in flat_data
        ]

    raw_messages = get_raw_messages(fn)

    def process_batch(lst: List[Any]) -> None:
        process_raw_message_batch(
            realm_id=realm_id,
            raw_messages=lst,
            zerver_subscription=zerver_subscription,
            user_handler=user_handler,
            attachment_handler=attachment_handler,
            get_recipient_id=get_recipient_id,
            output_dir=output_dir,
        )

    chunk_size = 1000

    process_list_in_batches(
        lst=raw_messages,
        chunk_size=chunk_size,
        process_batch=process_batch,
    )
Esempio n. 4
0
def process_message_file(realm_id: int, slim_mode: bool, fn: str, fn_id: str,
                         files_dir: str,
                         get_recipient_id: Callable[[ZerverFieldsT], int],
                         message_key: str, subscriber_map: Dict[int, Set[int]],
                         data_dir: str, output_dir: str, is_pm_data: bool,
                         masking_content: bool, user_id_mapper: IdMapper,
                         user_handler: UserHandler,
                         attachment_handler: AttachmentHandler) -> None:
    """Read one exported message file and process its messages in batches.

    Each entry of the file that has ``message_key`` is converted to a raw
    message dict. Entries are dropped when no sender id can be resolved or,
    for PM data, when the sender is not the owner of this file (``fn_id``).
    The surviving messages go to ``process_raw_message_batch`` in chunks of
    1000.
    """

    def to_raw_message(entry: Dict[str, Any]) -> Optional[ZerverFieldsT]:
        """Convert one exported message, or return None to drop it."""
        sender_id = get_hipchat_sender_id(
            realm_id=realm_id,
            slim_mode=slim_mode,
            message_dict=entry,
            user_id_mapper=user_id_mapper,
            user_handler=user_handler,
        )
        if sender_id is None:
            return None

        # We need to compare with str() on both sides here.
        # In Stride, user IDs are strings, but in HipChat,
        # they are integers, and fn_id is always a string.
        #
        # PMs are in multiple places in the Hipchat export,
        # and we only use the copy from the sender.
        if is_pm_data and str(sender_id) != str(fn_id):
            return None

        text = entry['message']
        if masking_content:
            # Lowercase letters become 'x', then uppercase letters become 'X'.
            text = re.sub('[A-Z]', 'X', re.sub('[a-z]', 'x', text))

        return {
            'fn_id': fn_id,
            'sender_id': sender_id,
            'receiver_id': entry.get('receiver', {}).get('id'),
            'content': text,
            'mention_user_ids': entry.get('mentions', []),
            'date_sent': str_date_to_float(entry['timestamp']),
            'attachment': entry.get('attachment'),
            'files_dir': files_dir,
        }

    def get_raw_messages(fn: str) -> List[ZerverFieldsT]:
        with open(fn) as f:
            data = ujson.load(f)

        # Pull out every message payload first, then convert them in order.
        flat_data = [d[message_key] for d in data if message_key in d]
        converted = (to_raw_message(entry) for entry in flat_data)
        return [raw for raw in converted if raw is not None]

    def handle_batch(lst: List[Any]) -> None:
        process_raw_message_batch(
            realm_id=realm_id,
            raw_messages=lst,
            subscriber_map=subscriber_map,
            user_id_mapper=user_id_mapper,
            user_handler=user_handler,
            attachment_handler=attachment_handler,
            get_recipient_id=get_recipient_id,
            is_pm_data=is_pm_data,
            output_dir=output_dir,
        )

    process_list_in_batches(
        lst=get_raw_messages(fn),
        chunk_size=1000,
        process_batch=handle_batch,
    )
Esempio n. 5
0
def process_posts(
    num_teams: int,
    team_name: str,
    realm_id: int,
    post_data: List[Dict[str, Any]],
    get_recipient_id_from_receiver_name: Callable[[str, int], int],
    subscriber_map: Dict[int, Set[int]],
    output_dir: str,
    is_pm_data: bool,
    masking_content: bool,
    user_id_mapper: IdMapper,
    user_handler: UserHandler,
    zerver_realmemoji: List[Dict[str, Any]],
    total_reactions: List[Dict[str, Any]],
    uploads_list: List[ZerverFieldsT],
    zerver_attachment: List[ZerverFieldsT],
    mattermost_data_dir: str,
) -> None:
    """Convert the posts of ``team_name`` to raw messages and process them.

    Each selected post, followed immediately by its replies, is converted to
    a message dict; the resulting list is passed to
    ``process_raw_message_batch`` in chunks of 1000.
    """
    # Mattermost doesn't specify a team for private messages
    # in its export format.  Treating a missing team as the target
    # team requires that we only be importing data from a single
    # team (checked elsewhere).
    team_posts = [
        post for post in post_data if post.get("team", team_name) == team_name
    ]

    def message_to_dict(post_dict: Dict[str, Any]) -> Dict[str, Any]:
        sender_id = user_id_mapper.get(post_dict["user"])
        content = post_dict["message"]

        if masking_content:
            content = re.sub("[A-Z]", "X", re.sub("[a-z]", "x", content))

        # A missing or empty/None "reactions" entry becomes an empty list.
        reactions = post_dict.get("reactions") or []

        message_dict = {
            "sender_id": sender_id,
            "content": content,
            "date_sent": int(post_dict["create_at"] / 1000),
            "reactions": reactions,
        }

        if "channel" in post_dict:
            message_dict["channel_name"] = post_dict["channel"]
        elif "channel_members" in post_dict:
            # This case is for handling posts from PMs and huddles, not channels.
            # PMs and huddles are known as direct_channels in Slack and hence
            # the name channel_members.
            members = post_dict["channel_members"]
            member_count = len(members)
            if member_count > 2:
                message_dict["huddle_name"] = generate_huddle_name(members)
            elif member_count == 2:
                message_dict["pm_members"] = members
        else:
            raise AssertionError("Post without channel or channel_members key.")

        attachments = post_dict.get("attachments")
        if attachments:
            message_dict["attachments"] = attachments

        return message_dict

    raw_messages = []
    for post_dict in team_posts:
        raw_messages.append(message_to_dict(post_dict))

        # Replies to a message in Mattermost are stored in the main message object.
        # For now, we just append the replies immediately after the original message.
        replies = post_dict["replies"]
        if replies is None:
            continue

        for reply in replies:
            # A reply inherits its destination from the parent post.
            if "channel" in post_dict:
                reply["channel"] = post_dict["channel"]
            else:  # nocoverage
                reply["channel_members"] = post_dict["channel_members"]
            raw_messages.append(message_to_dict(reply))

    def handle_batch(lst: List[Dict[str, Any]]) -> None:
        process_raw_message_batch(
            realm_id=realm_id,
            raw_messages=lst,
            subscriber_map=subscriber_map,
            user_id_mapper=user_id_mapper,
            user_handler=user_handler,
            get_recipient_id_from_receiver_name=get_recipient_id_from_receiver_name,
            is_pm_data=is_pm_data,
            output_dir=output_dir,
            zerver_realmemoji=zerver_realmemoji,
            total_reactions=total_reactions,
            uploads_list=uploads_list,
            zerver_attachment=zerver_attachment,
            mattermost_data_dir=mattermost_data_dir,
        )

    process_list_in_batches(
        lst=raw_messages,
        chunk_size=1000,
        process_batch=handle_batch,
    )
def process_posts(team_name: str, realm_id: int, post_data: List[Dict[str,
                                                                      Any]],
                  get_recipient_id: Callable[[ZerverFieldsT], int],
                  subscriber_map: Dict[int, Set[int]], output_dir: str,
                  is_pm_data: bool, masking_content: bool,
                  user_id_mapper: IdMapper, user_handler: UserHandler,
                  username_to_user: Dict[str, Dict[str, Any]],
                  zerver_realmemoji: List[Dict[str, Any]],
                  total_reactions: List[Dict[str, Any]]) -> None:
    """Convert the posts of ``team_name`` to raw messages and process them.

    Posts whose ``"team"`` equals ``team_name`` are converted, each followed
    immediately by its replies, and the list is passed to
    ``process_raw_message_batch`` in chunks of 1000.
    """

    def message_to_dict(post_dict: Dict[str, Any]) -> Dict[str, Any]:
        sender_id = user_id_mapper.get(post_dict["user"])
        text = post_dict['message']

        if masking_content:
            text = re.sub('[A-Z]', 'X', re.sub('[a-z]', 'x', text))

        # A missing or empty/None "reactions" entry becomes an empty list.
        reactions = post_dict.get("reactions") or []

        return {
            "sender_id": sender_id,
            "receiver_id": post_dict["channel"],
            "content": text,
            "pub_date": int(post_dict['create_at'] / 1000),
            "reactions": reactions,
        }

    raw_messages = []
    for post_dict in post_data:
        if post_dict["team"] != team_name:
            continue

        raw_messages.append(message_to_dict(post_dict))

        # Replies to a message in Mattermost are stored in the main message object.
        # For now, we just append the replies immediately after the original message.
        replies = post_dict["replies"]
        if replies is None:
            continue

        for reply in replies:
            # A reply is sent to the same channel as its parent post.
            reply["channel"] = post_dict["channel"]
            raw_messages.append(message_to_dict(reply))

    def handle_batch(lst: List[Dict[str, Any]]) -> None:
        process_raw_message_batch(
            realm_id=realm_id,
            raw_messages=lst,
            subscriber_map=subscriber_map,
            user_id_mapper=user_id_mapper,
            user_handler=user_handler,
            get_recipient_id=get_recipient_id,
            is_pm_data=is_pm_data,
            output_dir=output_dir,
            zerver_realmemoji=zerver_realmemoji,
            total_reactions=total_reactions,
        )

    process_list_in_batches(
        lst=raw_messages,
        chunk_size=1000,
        process_batch=handle_batch,
    )
Esempio n. 7
0
def process_posts(num_teams: int, team_name: str, realm_id: int,
                  post_data: List[Dict[str, Any]],
                  get_recipient_id_from_receiver_name: Callable[[str, int],
                                                                int],
                  subscriber_map: Dict[int, Set[int]], output_dir: str,
                  is_pm_data: bool, masking_content: bool,
                  user_id_mapper: IdMapper, user_handler: UserHandler,
                  username_to_user: Dict[str, Dict[str, Any]],
                  zerver_realmemoji: List[Dict[str, Any]],
                  total_reactions: List[Dict[str, Any]]) -> None:
    """Convert the posts of ``team_name`` to raw messages and process them.

    Posts without a ``"team"`` key are treated as belonging to ``team_name``.
    Each selected post, followed immediately by its replies, is converted to
    a message dict; the list is passed to ``process_raw_message_batch`` in
    chunks of 1000.
    """
    # Mattermost doesn't specify a team for private messages
    # in its export format.  Defaulting to the target team requires
    # that we only be importing data from a single team (checked
    # elsewhere).
    team_posts = [
        post for post in post_data if post.get("team", team_name) == team_name
    ]

    def message_to_dict(post_dict: Dict[str, Any]) -> Dict[str, Any]:
        sender_id = user_id_mapper.get(post_dict["user"])
        text = post_dict['message']

        if masking_content:
            text = re.sub('[A-Z]', 'X', re.sub('[a-z]', 'x', text))

        # A missing or empty/None "reactions" entry becomes an empty list.
        reactions = post_dict.get("reactions") or []

        return {
            "sender_id": sender_id,
            "receiver_id": post_dict["channel"],
            "content": text,
            "date_sent": int(post_dict['create_at'] / 1000),
            "reactions": reactions,
        }

    raw_messages = []
    for post_dict in team_posts:
        raw_messages.append(message_to_dict(post_dict))

        # Replies to a message in Mattermost are stored in the main message object.
        # For now, we just append the replies immediately after the original message.
        replies = post_dict["replies"]
        if replies is None:
            continue

        for reply in replies:
            # A reply is sent to the same channel as its parent post.
            reply["channel"] = post_dict["channel"]
            raw_messages.append(message_to_dict(reply))

    def handle_batch(lst: List[Dict[str, Any]]) -> None:
        process_raw_message_batch(
            realm_id=realm_id,
            raw_messages=lst,
            subscriber_map=subscriber_map,
            user_id_mapper=user_id_mapper,
            user_handler=user_handler,
            get_recipient_id_from_receiver_name=get_recipient_id_from_receiver_name,
            is_pm_data=is_pm_data,
            output_dir=output_dir,
            zerver_realmemoji=zerver_realmemoji,
            total_reactions=total_reactions,
        )

    process_list_in_batches(
        lst=raw_messages,
        chunk_size=1000,
        process_batch=handle_batch,
    )
Esempio n. 8
0
def process_messages(
    realm_id: int,
    messages: List[Dict[str, Any]],
    subscriber_map: Dict[int, Set[int]],
    is_pm_data: bool,
    username_to_user_id_map: Dict[str, str],
    user_id_mapper: IdMapper,
    user_handler: UserHandler,
    user_id_to_recipient_id: Dict[int, int],
    stream_id_mapper: IdMapper,
    stream_id_to_recipient_id: Dict[int, int],
    huddle_id_mapper: IdMapper,
    huddle_id_to_recipient_id: Dict[int, int],
    room_id_to_room_map: Dict[str, Dict[str, Any]],
    dsc_id_to_dsc_map: Dict[str, Dict[str, Any]],
    direct_id_to_direct_map: Dict[str, Dict[str, Any]],
    huddle_id_to_huddle_map: Dict[str, Dict[str, Any]],
    zerver_realmemoji: List[ZerverFieldsT],
    total_reactions: List[ZerverFieldsT],
    uploads_list: List[ZerverFieldsT],
    zerver_attachment: List[ZerverFieldsT],
    upload_id_to_upload_data_map: Dict[str, Dict[str, Any]],
    output_dir: str,
) -> None:
    """Convert Rocket.Chat messages to raw message dicts and process them.

    Messages whose ``"t"`` field is not None are skipped. Every other message
    is converted to a dict holding sender, content, timestamp, reactions,
    recipient, topic, user mentions, channel mentions and (when present) the
    uploaded file. The resulting list is passed to
    ``process_raw_message_batch`` in chunks of 1000.
    """
    # Imported locally so that the skip notice below does not depend on a
    # module-level import of logging.
    import logging

    def list_reactions(
            reactions: Dict[str, Dict[str, Any]]) -> List[Dict[str, Any]]:
        # List of dictionaries of form:
        # {"name": "smile", "user_id": 2}
        reactions_list: List[Dict[str, Any]] = []
        for react_code in reactions:
            # The reaction name is the text after the first ":" of the code.
            name = react_code.split(":")[1]
            usernames = reactions[react_code]["usernames"]

            for username in usernames:
                rc_user_id = username_to_user_id_map[username]
                user_id = user_id_mapper.get(rc_user_id)
                reactions_list.append({"name": name, "user_id": user_id})

        return reactions_list

    def message_to_dict(message: Dict[str, Any]) -> Dict[str, Any]:
        rc_sender_id = message["u"]["_id"]
        sender_id = user_id_mapper.get(rc_sender_id)
        content = message["msg"]

        if message.get("reactions"):
            reactions = list_reactions(message["reactions"])
        else:
            reactions = []

        message_dict = dict(
            sender_id=sender_id,
            content=content,
            date_sent=int(message["ts"].timestamp()),
            reactions=reactions,
            has_link=bool(message.get("urls")),
        )

        # Add recipient_id and topic to message_dict
        if is_pm_data:
            # Message is in a PM or a huddle.
            rc_channel_id = message["rid"]
            if rc_channel_id in huddle_id_to_huddle_map:
                huddle_id = huddle_id_mapper.get(rc_channel_id)
                message_dict["recipient_id"] = huddle_id_to_recipient_id[
                    huddle_id]
            else:
                # The recipient is whichever of the first two listed members
                # is not the sender.
                rc_member_ids = direct_id_to_direct_map[rc_channel_id]["uids"]
                if rc_sender_id == rc_member_ids[0]:
                    zulip_member_id = user_id_mapper.get(rc_member_ids[1])
                    message_dict["recipient_id"] = user_id_to_recipient_id[
                        zulip_member_id]
                else:
                    zulip_member_id = user_id_mapper.get(rc_member_ids[0])
                    message_dict["recipient_id"] = user_id_to_recipient_id[
                        zulip_member_id]
            # PMs and huddles don't have topics, but topic_name field is required in `build_message`.
            message_dict["topic_name"] = ""
        elif message["rid"] in dsc_id_to_dsc_map:
            # Message is in a discussion
            dsc_channel = dsc_id_to_dsc_map[message["rid"]]
            parent_channel_id = dsc_channel["prid"]
            stream_id = stream_id_mapper.get(parent_channel_id)
            message_dict["recipient_id"] = stream_id_to_recipient_id[stream_id]

            # In case you change this, please also change the topic name used
            # in discussion mention to topic mention conversion below, while
            # adding the Rocket.Chat channel mention data to message_dict.
            message_dict[
                "topic_name"] = f'{dsc_channel["fname"]} (Imported from Rocket.Chat)'
        else:
            stream_id = stream_id_mapper.get(message["rid"])
            message_dict["recipient_id"] = stream_id_to_recipient_id[stream_id]
            message_dict["topic_name"] = "Imported from Rocket.Chat"

        # Add user mentions to message_dict
        mention_user_ids = set()
        for mention in message.get("mentions", []):
            mention_id = mention["_id"]
            if mention_id in ["all", "here"]:
                continue
            user_id = user_id_mapper.get(mention_id)
            mention_user_ids.add(user_id)
        message_dict["mention_user_ids"] = mention_user_ids

        # Add channel mentions to message_dict
        rc_channel_mention_data: List[Dict[str, str]] = []
        for mention in message.get("channels", []):
            mention_rc_channel_id = mention["_id"]
            mention_rc_channel_name = mention["name"]
            rc_mention = f"#{mention_rc_channel_name}"

            if mention_rc_channel_id in room_id_to_room_map:
                # Channel is converted to a stream.
                converted_stream_name = mention_rc_channel_name

                rc_channel = room_id_to_room_map[mention_rc_channel_id]
                if rc_channel.get("teamMain") is True:
                    # Channel is a team's main channel
                    converted_stream_name = "[TEAM] " + converted_stream_name

                zulip_mention = f"#**{converted_stream_name}**"
            elif mention_rc_channel_id in dsc_id_to_dsc_map:
                # Channel is a discussion and is converted to a topic.
                dsc_channel = dsc_id_to_dsc_map[mention_rc_channel_id]
                parent_channel_id = dsc_channel["prid"]
                parent_rc_channel = room_id_to_room_map[parent_channel_id]

                converted_topic_name = f'{dsc_channel["fname"]} (Imported from Rocket.Chat)'
                parent_stream_name = parent_rc_channel["name"]

                if parent_rc_channel.get("teamMain") is True:
                    # Parent channel is a team's main channel
                    parent_stream_name = "[TEAM] " + parent_stream_name

                zulip_mention = f"#**{parent_stream_name}>{converted_topic_name}**"
            else:
                # The mentioned channel is in neither map, so there is no
                # Zulip syntax to convert it to. Without this branch,
                # `zulip_mention` would be unbound on the first mention or
                # silently reuse the previous mention's value.
                logging.info("Failed to map mention '%s' to zulip syntax.", mention)
                continue

            mention_data = {
                "rc_mention": rc_mention,
                "zulip_mention": zulip_mention
            }
            rc_channel_mention_data.append(mention_data)
        message_dict["rc_channel_mention_data"] = rc_channel_mention_data

        # Add uploaded file (attachment) to message_dict
        if message.get("file"):
            message_dict["file"] = message["file"]

        return message_dict

    raw_messages: List[Dict[str, Any]] = []
    for message in messages:
        if message.get("t") is not None:
            # Messages with a type are system notifications like user_joined
            # that we don't include.
            continue
        raw_messages.append(message_to_dict(message))

    def process_batch(lst: List[Dict[str, Any]]) -> None:
        process_raw_message_batch(
            realm_id=realm_id,
            raw_messages=lst,
            subscriber_map=subscriber_map,
            user_handler=user_handler,
            is_pm_data=is_pm_data,
            output_dir=output_dir,
            zerver_realmemoji=zerver_realmemoji,
            total_reactions=total_reactions,
            uploads_list=uploads_list,
            zerver_attachment=zerver_attachment,
            upload_id_to_upload_data_map=upload_id_to_upload_data_map,
        )

    chunk_size = 1000

    process_list_in_batches(
        lst=raw_messages,
        chunk_size=chunk_size,
        process_batch=process_batch,
    )
Esempio n. 9
0
def process_messages(
    realm_id: int,
    messages: List[Dict[str, Any]],
    subscriber_map: Dict[int, Set[int]],
    is_pm_data: bool,
    username_to_user_id_map: Dict[str, str],
    user_id_mapper: IdMapper,
    user_handler: UserHandler,
    user_id_to_recipient_id: Dict[int, int],
    stream_id_mapper: IdMapper,
    stream_id_to_recipient_id: Dict[int, int],
    huddle_id_mapper: IdMapper,
    huddle_id_to_recipient_id: Dict[int, int],
    thread_id_mapper: IdMapper,
    room_id_to_room_map: Dict[str, Dict[str, Any]],
    dsc_id_to_dsc_map: Dict[str, Dict[str, Any]],
    direct_id_to_direct_map: Dict[str, Dict[str, Any]],
    huddle_id_to_huddle_map: Dict[str, Dict[str, Any]],
    zerver_realmemoji: List[ZerverFieldsT],
    total_reactions: List[ZerverFieldsT],
    uploads_list: List[ZerverFieldsT],
    zerver_attachment: List[ZerverFieldsT],
    upload_id_to_upload_data_map: Dict[str, Dict[str, Any]],
    output_dir: str,
) -> None:
    """Convert Rocket.Chat messages to raw message dicts and process them.

    Messages whose ``"t"`` field is not None are skipped. Every other message
    is converted to a dict holding sender, content, timestamp, reactions,
    recipient, topic name (from ``get_topic_name``), user and wildcard
    mentions, channel mentions and (when present) the uploaded file. The
    resulting list is passed to ``process_raw_message_batch`` in chunks of
    1000.
    """

    def list_reactions(
            reactions: Dict[str, Dict[str, Any]]) -> List[Dict[str, Any]]:
        # List of dictionaries of form:
        # {"name": "smile", "user_id": 2}
        reactions_list: List[Dict[str, Any]] = []
        for react_code in reactions:
            # The reaction name is the text after the first ":" of the code.
            name = react_code.split(":")[1]
            usernames = reactions[react_code]["usernames"]

            for username in usernames:
                rc_user_id = username_to_user_id_map[username]
                user_id = user_id_mapper.get(rc_user_id)
                reactions_list.append({"name": name, "user_id": user_id})

        return reactions_list

    def message_to_dict(message: Dict[str, Any]) -> Dict[str, Any]:
        rc_sender_id = message["u"]["_id"]
        sender_id = user_id_mapper.get(rc_sender_id)
        content = message["msg"]

        if message.get("reactions"):
            reactions = list_reactions(message["reactions"])
        else:
            reactions = []

        message_dict = dict(
            sender_id=sender_id,
            content=content,
            date_sent=int(message["ts"].timestamp()),
            reactions=reactions,
            has_link=bool(message.get("urls")),
        )

        # Add recipient_id to message_dict
        if is_pm_data:
            # Message is in a PM or a huddle.
            rc_channel_id = message["rid"]
            if rc_channel_id in huddle_id_to_huddle_map:
                huddle_id = huddle_id_mapper.get(rc_channel_id)
                message_dict["recipient_id"] = huddle_id_to_recipient_id[
                    huddle_id]
            else:
                # The recipient is whichever of the first two listed members
                # is not the sender.
                rc_member_ids = direct_id_to_direct_map[rc_channel_id]["uids"]
                if rc_sender_id == rc_member_ids[0]:
                    zulip_member_id = user_id_mapper.get(rc_member_ids[1])
                    message_dict["recipient_id"] = user_id_to_recipient_id[
                        zulip_member_id]
                else:
                    zulip_member_id = user_id_mapper.get(rc_member_ids[0])
                    message_dict["recipient_id"] = user_id_to_recipient_id[
                        zulip_member_id]
        elif message["rid"] in dsc_id_to_dsc_map:
            # Message is in a discussion
            dsc_channel = dsc_id_to_dsc_map[message["rid"]]
            parent_channel_id = dsc_channel["prid"]
            stream_id = stream_id_mapper.get(parent_channel_id)
            message_dict["recipient_id"] = stream_id_to_recipient_id[stream_id]
        else:
            stream_id = stream_id_mapper.get(message["rid"])
            message_dict["recipient_id"] = stream_id_to_recipient_id[stream_id]

        # Add topic name to message_dict
        message_dict["topic_name"] = get_topic_name(message, dsc_id_to_dsc_map,
                                                    thread_id_mapper,
                                                    is_pm_data)

        # Add user mentions to message_dict
        mention_user_ids = set()
        wildcard_mention = False
        for mention in message.get("mentions", []):
            mention_id = mention["_id"]
            if mention_id in ["all", "here"]:
                wildcard_mention = True
                continue
            user_id = user_id_mapper.get(mention_id)
            mention_user_ids.add(user_id)
        message_dict["mention_user_ids"] = mention_user_ids
        message_dict["wildcard_mention"] = wildcard_mention

        # Add channel mentions to message_dict
        rc_channel_mention_data: List[Dict[str, str]] = []
        for mention in message.get("channels", []):
            mention_rc_channel_id = mention["_id"]
            mention_rc_channel_name = mention["name"]
            rc_mention = f"#{mention_rc_channel_name}"

            if mention_rc_channel_id in room_id_to_room_map:
                # Channel is converted to a stream.
                rc_channel = room_id_to_room_map[mention_rc_channel_id]
                converted_stream_name = get_stream_name(rc_channel)

                zulip_mention = f"#**{converted_stream_name}**"
            elif mention_rc_channel_id in dsc_id_to_dsc_map:
                # Channel is a discussion and is converted to a topic.
                dsc_channel = dsc_id_to_dsc_map[mention_rc_channel_id]
                parent_channel_id = dsc_channel["prid"]
                if (parent_channel_id in direct_id_to_direct_map
                        or parent_channel_id in huddle_id_to_huddle_map):
                    # Discussion belongs to a direct channel and thus, should not be
                    # linked.

                    # This logging statement serves the side benefit of avoiding the
                    # CPython optimization for `continue` so that the coverage reports
                    # aren't misleading.
                    logging.info(
                        "skipping direct messages discussion mention: %s",
                        dsc_channel["fname"])
                    continue

                converted_topic_name = get_topic_name(
                    message={"rid": mention_rc_channel_id},
                    dsc_id_to_dsc_map=dsc_id_to_dsc_map,
                    thread_id_mapper=thread_id_mapper,
                )

                parent_rc_channel = room_id_to_room_map[parent_channel_id]
                parent_stream_name = get_stream_name(parent_rc_channel)

                zulip_mention = f"#**{parent_stream_name}>{converted_topic_name}**"
            else:
                # The mentioned channel is in neither map, so there is no
                # Zulip syntax to convert it to. Without this branch,
                # `zulip_mention` would be unbound on the first mention or
                # silently reuse the previous mention's value.
                logging.info("Failed to map mention '%s' to zulip syntax.", mention)
                continue

            mention_data = {
                "rc_mention": rc_mention,
                "zulip_mention": zulip_mention
            }
            rc_channel_mention_data.append(mention_data)
        message_dict["rc_channel_mention_data"] = rc_channel_mention_data

        # Add uploaded file (attachment) to message_dict
        if message.get("file"):
            message_dict["file"] = message["file"]

        return message_dict

    raw_messages: List[Dict[str, Any]] = []
    for message in messages:
        if message.get("t") is not None:
            # Messages with a type are system notifications like user_joined
            # that we don't include.
            continue
        raw_messages.append(message_to_dict(message))

    def process_batch(lst: List[Dict[str, Any]]) -> None:
        process_raw_message_batch(
            realm_id=realm_id,
            raw_messages=lst,
            subscriber_map=subscriber_map,
            user_handler=user_handler,
            is_pm_data=is_pm_data,
            output_dir=output_dir,
            zerver_realmemoji=zerver_realmemoji,
            total_reactions=total_reactions,
            uploads_list=uploads_list,
            zerver_attachment=zerver_attachment,
            upload_id_to_upload_data_map=upload_id_to_upload_data_map,
        )

    chunk_size = 1000

    process_list_in_batches(
        lst=raw_messages,
        chunk_size=chunk_size,
        process_batch=process_batch,
    )
Esempio n. 10
0
def process_messages(
    realm_id: int,
    messages: List[Dict[str, Any]],
    subscriber_map: Dict[int, Set[int]],
    is_pm_data: bool,
    username_to_user_id_map: Dict[str, str],
    user_id_mapper: IdMapper,
    user_handler: UserHandler,
    user_id_to_recipient_id: Dict[int, int],
    stream_id_mapper: IdMapper,
    stream_id_to_recipient_id: Dict[int, int],
    huddle_id_mapper: IdMapper,
    huddle_id_to_recipient_id: Dict[int, int],
    dsc_id_to_dsc_map: Dict[str, Dict[str, Any]],
    direct_id_to_direct_map: Dict[str, Dict[str, Any]],
    huddle_id_to_huddle_map: Dict[str, Dict[str, Any]],
    total_reactions: List[ZerverFieldsT],
    output_dir: str,
) -> None:
    """Convert Rocket.Chat messages to raw message dicts and process them in batches.

    Every message whose "t" field is None (or missing) is converted into a
    dict with the keys ``sender_id``, ``content``, ``date_sent``,
    ``reactions``, ``recipient_id``, ``topic_name`` and ``mention_user_ids``.
    The converted messages are then handed to ``process_raw_message_batch``
    in chunks of 1000 via ``process_list_in_batches``.

    When ``is_pm_data`` is true, each message's "rid" is looked up in
    ``huddle_id_to_huddle_map`` first and otherwise in
    ``direct_id_to_direct_map``.  When it is false, "rid" is treated as a
    discussion if it is in ``dsc_id_to_dsc_map`` and as a stream otherwise.

    Lookups in the supplied maps use plain indexing, so an id that is
    missing from one of the dict arguments raises ``KeyError``.
    """

    def list_reactions(
            reactions: Dict[str, Dict[str, Any]]) -> List[Dict[str, Any]]:
        # Flatten the reactions mapping into a list of dictionaries of form:
        # {"name": "smile", "user_id": 2}
        reactions_list: List[Dict[str, Any]] = []
        for react_code, reaction in reactions.items():
            # The name is the text between the first two colons of the key.
            name = react_code.split(":")[1]

            for username in reaction["usernames"]:
                rc_user_id = username_to_user_id_map[username]
                user_id = user_id_mapper.get(rc_user_id)
                reactions_list.append({"name": name, "user_id": user_id})

        return reactions_list

    def message_to_dict(message: Dict[str, Any]) -> Dict[str, Any]:
        rc_sender_id = message["u"]["_id"]
        sender_id = user_id_mapper.get(rc_sender_id)
        content = message["msg"]

        if message.get("reactions"):
            reactions = list_reactions(message["reactions"])
        else:
            reactions = []

        message_dict = dict(
            sender_id=sender_id,
            content=content,
            date_sent=int(message["ts"].timestamp()),
            reactions=reactions,
        )

        # Add recipient_id and topic to message_dict
        if is_pm_data:
            # Message is in a PM or a huddle.
            rc_channel_id = message["rid"]
            if rc_channel_id in huddle_id_to_huddle_map:
                huddle_id = huddle_id_mapper.get(rc_channel_id)
                message_dict["recipient_id"] = huddle_id_to_recipient_id[
                    huddle_id]
            else:
                rc_member_ids = direct_id_to_direct_map[rc_channel_id]["uids"]
                # The recipient is the member who is not the sender.  A
                # direct room may list only a single member (presumably a
                # conversation with yourself); in that case fall back to
                # that only member instead of indexing past the end.
                if len(rc_member_ids) > 1 and rc_sender_id == rc_member_ids[0]:
                    rc_recipient_id = rc_member_ids[1]
                else:
                    rc_recipient_id = rc_member_ids[0]
                zulip_member_id = user_id_mapper.get(rc_recipient_id)
                message_dict["recipient_id"] = user_id_to_recipient_id[
                    zulip_member_id]
            # PMs and huddles don't have topics, but topic_name field is required in `build_message`.
            message_dict["topic_name"] = ""
        elif message["rid"] in dsc_id_to_dsc_map:
            # Message is in a discussion: it is filed under the parent
            # channel's stream, with the discussion name as the topic.
            dsc_channel = dsc_id_to_dsc_map[message["rid"]]
            parent_channel_id = dsc_channel["prid"]
            stream_id = stream_id_mapper.get(parent_channel_id)
            message_dict["recipient_id"] = stream_id_to_recipient_id[stream_id]
            message_dict[
                "topic_name"] = f'{dsc_channel["fname"]} (Imported from Rocket.Chat)'
        else:
            # Message is in a regular channel.
            stream_id = stream_id_mapper.get(message["rid"])
            message_dict["recipient_id"] = stream_id_to_recipient_id[stream_id]
            message_dict["topic_name"] = "Imported from Rocket.Chat"

        # Add mentions to message_dict
        mention_user_ids = set()
        for mention in message.get("mentions", []):
            mention_id = mention["_id"]
            # "all" and "here" are not mapped to a user id.
            if mention_id in ("all", "here"):
                continue
            user_id = user_id_mapper.get(mention_id)
            mention_user_ids.add(user_id)
        message_dict["mention_user_ids"] = mention_user_ids

        return message_dict

    # Messages with a type are system notifications like user_joined
    # that we don't include.
    raw_messages: List[Dict[str, Any]] = [
        message_to_dict(message)
        for message in messages
        if message.get("t") is None
    ]

    def process_batch(lst: List[Dict[str, Any]]) -> None:
        process_raw_message_batch(
            realm_id=realm_id,
            raw_messages=lst,
            subscriber_map=subscriber_map,
            user_handler=user_handler,
            is_pm_data=is_pm_data,
            output_dir=output_dir,
            total_reactions=total_reactions,
        )

    chunk_size = 1000

    process_list_in_batches(
        lst=raw_messages,
        chunk_size=chunk_size,
        process_batch=process_batch,
    )
Esempio n. 11
0
def process_message_file(realm_id: int, fn: str, fn_id: int, files_dir: str,
                         get_recipient_id: Callable[[ZerverFieldsT],
                                                    int], message_key: str,
                         zerver_subscription: List[ZerverFieldsT],
                         data_dir: str, output_dir: str,
                         user_handler: UserHandler,
                         attachment_handler: AttachmentHandler) -> None:
    """Load one message file and push its messages through batch processing.

    The JSON file at ``fn`` is read with ``ujson``.  Every entry that has
    ``message_key`` contributes one payload, and each payload is turned
    into a raw message dict.  The raw messages are then passed to
    ``process_raw_message_batch`` in chunks of 1000 via
    ``process_list_in_batches``.

    ``data_dir`` is accepted but not referenced by this function.
    """

    def to_raw_message(payload: Dict[str, Any]) -> ZerverFieldsT:
        sender = payload['sender']
        if isinstance(sender, str):
            # Some Hipchat instances put only a person's name in the
            # sender field for NotificationMessage; such senders are
            # represented by a mirror user.
            mirror_user = user_handler.get_mirror_user(
                realm_id=realm_id,
                name=sender,
            )
            sender_id = mirror_user['id']
        else:
            sender_id = sender['id']

        return {
            'fn_id': fn_id,
            'sender_id': sender_id,
            'receiver_id': payload.get('receiver', {}).get('id'),
            'content': payload['message'],
            'mention_user_ids': payload.get('mentions', []),
            'pub_date': str_date_to_float(payload['timestamp']),
            'attachment': payload.get('attachment'),
            'files_dir': files_dir,
        }

    def load_raw_messages(path: str) -> List[ZerverFieldsT]:
        with open(path) as json_file:
            entries = ujson.load(json_file)

        # Only entries carrying `message_key` hold a message payload.
        payloads = [
            entry[message_key] for entry in entries if message_key in entry
        ]
        return [to_raw_message(payload) for payload in payloads]

    raw_messages = load_raw_messages(fn)

    def forward_batch(batch: List[Any]) -> None:
        process_raw_message_batch(
            realm_id=realm_id,
            raw_messages=batch,
            zerver_subscription=zerver_subscription,
            user_handler=user_handler,
            attachment_handler=attachment_handler,
            get_recipient_id=get_recipient_id,
            output_dir=output_dir,
        )

    process_list_in_batches(
        lst=raw_messages,
        chunk_size=1000,
        process_batch=forward_batch,
    )