Ejemplo n.º 1
0
def convert_gitter_workspace_messages(gitter_data: GitterDataT, output_dir: str,
                                      subscriber_map: Dict[int, Set[int]],
                                      user_map: Dict[str, int],
                                      user_short_name_to_full_name: Dict[str, str],
                                      chunk_size: int=MESSAGE_BATCH_CHUNK_SIZE) -> None:
    """
    Convert Gitter messages and write them out in numbered batch files.

    ``gitter_data`` is consumed in consecutive slices of ``chunk_size``
    entries.  Each non-empty slice is written to its own
    ``messages-NNNNNN.json`` file under ``output_dir`` (numbered from 1),
    holding the 'zerver_message' and 'zerver_usermessage' lists built for
    that slice.  Message ids are assigned sequentially from 0 across all
    batches and every message uses recipient id 0.
    """
    logging.info('######### IMPORTING MESSAGES STARTED #########\n')
    stream_recipient_id = 0  # Corresponding to stream "gitter"
    next_message_id = 0

    batch_start = 0
    batch_end = batch_start + chunk_size
    batch_number = 1

    while True:
        batch = gitter_data[batch_start: batch_end]
        if len(batch) == 0:
            # Ran off the end of the data: nothing left to convert.
            break

        batch_messages = []
        batch_usermessages = []  # type: List[ZerverFieldsT]

        for gitter_message in batch:
            sent_timestamp = dateutil.parser.parse(gitter_message['sent']).timestamp()
            mentions = get_usermentions(gitter_message, user_map,
                                        user_short_name_to_full_name)
            sender_id = user_map[gitter_message['fromUser']['id']]

            # rendered_content is left as None for every imported message.
            batch_messages.append(
                build_message('imported from gitter', float(sent_timestamp),
                              next_message_id, gitter_message['text'],
                              None, sender_id, stream_recipient_id))

            build_usermessages(
                zerver_usermessage=batch_usermessages,
                subscriber_map=subscriber_map,
                recipient_id=stream_recipient_id,
                mentioned_user_ids=mentions,
                message_id=next_message_id,
            )

            next_message_id += 1

        message_filename = os.path.join(output_dir, "messages-%06d.json" % (batch_number,))
        logging.info("Writing Messages to %s\n" % (message_filename,))
        write_data_to_file(message_filename, {
            'zerver_message': batch_messages,
            'zerver_usermessage': batch_usermessages,
        })

        # Slide the window forward by one chunk.
        batch_start, batch_end = batch_end, batch_end + chunk_size
        batch_number += 1

    logging.info('######### IMPORTING MESSAGES FINISHED #########\n')
Ejemplo n.º 2
0
    def make_message(message_id: int, raw_message: ZerverFieldsT) -> ZerverFieldsT:
        """
        Build the Zulip message record for one raw message.

        Besides returning the record, this stores the raw message's
        mentioned user ids (as a set) in ``mention_map`` under
        ``message_id``; ``mention_map`` is defined outside this function.
        """
        mentioned_ids = raw_message['mention_user_ids']

        # Side effect: remember who was mentioned in this message.
        mention_map[message_id] = set(mentioned_ids)

        fixed_content = fix_mentions(
            content=raw_message['content'],
            mention_user_ids=mentioned_ids,
        )
        sent_at = raw_message['pub_date']
        target_recipient_id = get_recipient_id(raw_message)
        sender_id = raw_message['sender_id']

        # Every message gets the fixed subject 'archived' and no
        # pre-rendered content.
        return build_message(
            content=fixed_content,
            message_id=message_id,
            pub_date=sent_at,
            recipient_id=target_recipient_id,
            rendered_content=None,
            subject='archived',
            user_id=sender_id,
        )
Ejemplo n.º 3
0
Archivo: slack.py Proyecto: kyoki/zulip
def channel_message_to_zerver_message(realm_id: int, users: List[ZerverFieldsT],
                                      added_users: AddedUsersT,
                                      added_recipient: AddedRecipientsT,
                                      all_messages: List[ZerverFieldsT],
                                      zerver_realmemoji: List[ZerverFieldsT],
                                      zerver_subscription: List[ZerverFieldsT],
                                      added_channels: AddedChannelsT,
                                      id_list: Tuple[int, int, int, int],
                                      domain_name: str) -> Tuple[List[ZerverFieldsT],
                                                                 List[ZerverFieldsT],
                                                                 List[ZerverFieldsT],
                                                                 List[ZerverFieldsT],
                                                                 List[ZerverFieldsT],
                                                                 Tuple[int, int, int, int]]:
    """
    Convert Slack channel messages into Zulip message, usermessage,
    attachment, upload and reaction records.

    A message is skipped when get_message_sending_user() returns a falsy
    value, when its subtype is one of the pin/join/leave/rename notices
    listed below, or when converting its text raises.

    `id_list` supplies the starting values of the message, usermessage,
    reaction and attachment id counters; the advanced counters are
    returned so a later call can continue from them.
    `added_users` is indexed by the value returned from
    get_message_sending_user(); `added_recipient` is indexed by
    message['channel_name'].

    Returns:
    1. zerver_message, which is a list of the messages
    2. zerver_usermessage, which is a list of the usermessages
    3. zerver_attachment, which is a list of the attachments
    4. uploads_list, which is a list of uploads to be mapped in uploads records.json
    5. reaction_list, which is a list of all user reactions
    6. id_list, which is a tuple of max ids of messages, usermessages, reactions and attachments
    """
    message_id_count, usermessage_id_count, reaction_id_count, attachment_id_count = id_list
    zerver_message = []
    zerver_usermessage = []  # type: List[ZerverFieldsT]
    uploads_list = []  # type: List[ZerverFieldsT]
    zerver_attachment = []  # type: List[ZerverFieldsT]
    reaction_list = []  # type: List[ZerverFieldsT]

    # For unicode emoji
    with open(NAME_TO_CODEPOINT_PATH) as fp:
        name_to_codepoint = ujson.load(fp)

    for message in all_messages:
        user = get_message_sending_user(message)
        if not user:
            # Ignore messages without user names.
            # These are sometimes produced by Slack.
            continue

        subtype = message.get('subtype', False)
        if subtype in [
                # Zulip doesn't have a pinned_item concept
                "pinned_item",
                "unpinned_item",
                # Slack's channel join/leave notices are spammy
                "channel_join",
                "channel_leave",
                "channel_name"
        ]:
            continue

        has_attachment = has_image = False
        try:
            content, mentioned_users_id, has_link = convert_to_zulip_markdown(
                message['text'], users, added_channels, added_users)
        except Exception:
            # Best effort: any failure here (including a missing 'text' key)
            # is reported on stdout and the message is dropped.
            print("Slack message unexpectedly missing text representation:")
            print(json.dumps(message, indent=4))
            continue
        rendered_content = None

        recipient_id = added_recipient[message['channel_name']]
        # The id is taken from the counter now; the counter itself is only
        # advanced at the bottom of the loop, once the message is emitted.
        message_id = message_id_count

        # Process message reactions
        if 'reactions' in message.keys():
            reaction_id_count = build_reactions(reaction_list, message['reactions'], added_users,
                                                message_id, reaction_id_count, name_to_codepoint,
                                                zerver_realmemoji)

        # Process different subtypes of slack messages

        # Subtypes which have only the action in the message should
        # be rendered with '/me' in the content initially
        # For example "sh_room_created" has the message 'started a call'
        # which should be displayed as '/me started a call'
        if subtype in ["bot_add", "sh_room_created", "me_message"]:
            content = ('/me %s' % (content))

        files = message.get('files', [])
        if subtype == 'file_share':
            # In Slack messages, uploads can either have the subtype as 'file_share' or
            # have the upload information in 'files' keyword
            # For 'file_share' the single message['file'] entry replaces
            # whatever 'files' held.
            files = [message['file']]

        for fileinfo in files:
            url = fileinfo['url_private']
            # For attachments with slack download link
            if 'files.slack.com' in url:
                has_attachment = has_link = True
                # NOTE(review): has_image is overwritten for each
                # Slack-hosted file, so it reflects only the last one
                # processed -- confirm this is intended.
                has_image = True if 'image' in fileinfo['mimetype'] else False

                # Look up the uploader by message['user']; raises IndexError
                # if no entry in `users` has that id.
                file_user = [iterate_user for iterate_user in users if message['user'] == iterate_user['id']]
                file_user_email = get_user_email(file_user[0], domain_name)

                # NOTE(review): this rebinds `content`, discarding the
                # converted message text (and any '/me' prefix) built
                # above -- confirm this is intended.
                s3_path, content = get_attachment_path_and_content(fileinfo, realm_id)

                # construct attachments
                build_uploads(added_users[user], realm_id, file_user_email, fileinfo, s3_path,
                              uploads_list)

                attachment_id = attachment_id_count
                build_attachment(realm_id, message_id, attachment_id, added_users[user],
                                 fileinfo, s3_path, zerver_attachment)
                attachment_id_count += 1
            # For attachments with link not from slack
            # Example: Google drive integration
            else:
                has_link = True
                if 'title' in fileinfo:
                    file_name = fileinfo['title']
                else:
                    file_name = fileinfo['name']
                # As above, the message content is replaced by a markdown
                # link to the external file.
                content = '[%s](%s)' % (file_name, fileinfo['url_private'])

        # construct message
        subject = 'imported from slack'

        zulip_message = build_message(subject, float(message['ts']), message_id, content,
                                      rendered_content, added_users[user], recipient_id,
                                      has_image, has_link, has_attachment)
        zerver_message.append(zulip_message)

        # construct usermessages
        usermessage_id_count = build_usermessages(
            zerver_usermessage, usermessage_id_count, zerver_subscription,
            recipient_id, mentioned_users_id, message_id)

        message_id_count += 1

    # Hand the advanced counters back to the caller.
    id_list = (message_id_count, usermessage_id_count,
               reaction_id_count, attachment_id_count)
    return zerver_message, zerver_usermessage, zerver_attachment, uploads_list, \
        reaction_list, id_list
Ejemplo n.º 4
0
def process_raw_message_batch(realm_id: int,
                              raw_messages: List[Dict[str, Any]],
                              subscriber_map: Dict[int, Set[int]],
                              user_id_mapper: IdMapper,
                              user_handler: UserHandler,
                              attachment_handler: AttachmentHandler,
                              get_recipient_id: Callable[[ZerverFieldsT], int],
                              is_pm_data: bool, output_dir: str) -> None:
    """
    Convert one batch of raw HipChat messages and write it to a dump file.

    For each raw message this rewrites HipChat mentions to Zulip syntax,
    passes the content through html2text, resolves the recipient via
    ``get_recipient_id`` and lets ``attachment_handler`` process any
    attachment.  Messages whose converted content exceeds 10000
    characters, or for which ``get_recipient_id`` raises KeyError, are
    skipped.  A message id is still drawn from NEXT_ID('message') for a
    skipped message, and its entry stays in the mention map.

    The resulting 'zerver_message' and 'zerver_usermessage' lists are
    handed to create_converted_data_files() with a file name of the form
    '/messages-NNNNNN.json' numbered by NEXT_ID('dump_file_id').
    """
    def fix_mentions(content: str, mention_user_ids: Set[int]) -> str:
        # Rewrite '@short_name' mentions to Zulip's '@**Full Name**' form.
        for user_id in mention_user_ids:
            user = user_handler.get_user(user_id=user_id)
            hipchat_mention = '@{short_name}'.format(**user)
            zulip_mention = '@**{full_name}**'.format(**user)
            content = content.replace(hipchat_mention, zulip_mention)

        content = content.replace('@here', '@**all**')
        return content

    mention_map = {}  # type: Dict[int, Set[int]]

    zerver_message = []

    import html2text
    h = html2text.HTML2Text()

    for raw_message in raw_messages:
        message_id = NEXT_ID('message')

        # Keep only the mentioned users that the mapper knows about,
        # translated to their mapped ids.
        mention_user_ids = {
            user_id_mapper.get(hipchat_user_id)
            for hipchat_user_id in set(raw_message['mention_user_ids'])
            if user_id_mapper.has(hipchat_user_id)
        }
        # Side effect: recorded for make_user_messages() below.
        mention_map[message_id] = mention_user_ids

        content = fix_mentions(
            content=raw_message['content'],
            mention_user_ids=mention_user_ids,
        )
        content = h.handle(content)

        if len(content) > 10000:
            logging.info('skipping too-long message of length %s', len(content))
            continue

        pub_date = raw_message['pub_date']

        try:
            recipient_id = get_recipient_id(raw_message)
        except KeyError:
            logging.debug(
                "Could not find recipient_id for a message, skipping.")
            continue

        rendered_content = None

        if is_pm_data:
            topic_name = ''
        else:
            topic_name = 'imported from hipchat'
        user_id = raw_message['sender_id']

        # Another side effect: the handler processes the attachment and
        # returns any text to append to the message.
        extra_content = attachment_handler.handle_message_data(
            realm_id=realm_id,
            message_id=message_id,
            sender_id=user_id,
            attachment=raw_message['attachment'],
            files_dir=raw_message['files_dir'],
        )

        if extra_content:
            has_attachment = True
            content += '\n' + extra_content
        else:
            has_attachment = False

        message = build_message(
            content=content,
            message_id=message_id,
            pub_date=pub_date,
            recipient_id=recipient_id,
            rendered_content=rendered_content,
            topic_name=topic_name,
            user_id=user_id,
            has_attachment=has_attachment,
        )
        zerver_message.append(message)

    zerver_usermessage = make_user_messages(
        zerver_message=zerver_message,
        subscriber_map=subscriber_map,
        is_pm_data=is_pm_data,
        mention_map=mention_map,
    )

    message_json = dict(
        zerver_message=zerver_message,
        zerver_usermessage=zerver_usermessage,
    )

    dump_file_id = NEXT_ID('dump_file_id')
    message_file = "/messages-%06d.json" % (dump_file_id, )
    create_converted_data_files(message_json, output_dir, message_file)
Ejemplo n.º 5
0
def process_raw_message_batch(
    realm_id: int,
    raw_messages: List[Dict[str, Any]],
    subscriber_map: Dict[int, Set[int]],
    user_id_mapper: IdMapper,
    user_handler: UserHandler,
    get_recipient_id_from_receiver_name: Callable[[str, int], int],
    is_pm_data: bool,
    output_dir: str,
    zerver_realmemoji: List[Dict[str, Any]],
    total_reactions: List[Dict[str, Any]],
    uploads_list: List[ZerverFieldsT],
    zerver_attachment: List[ZerverFieldsT],
    mattermost_data_dir: str,
) -> None:
    """
    Convert one batch of raw Mattermost messages and write it to a dump file.

    Each raw message has its mentions rewritten to Zulip syntax and its
    content passed through html2text.  The recipient is resolved from
    whichever of the 'channel_name', 'huddle_name' or 'pm_members' keys
    the message carries; a message with none of them raises
    AssertionError.  Attachments and reactions are processed through
    process_message_attachments() and build_reactions().  Messages whose
    converted content exceeds 10000 characters are skipped.

    The converted messages and their usermessage rows are handed to
    create_converted_data_files() under a '/messages-NNNNNN.json' name
    numbered by a per-realm NEXT_ID counter.
    """

    def fix_mentions(content: str, mention_user_ids: Set[int]) -> str:
        for mentioned_id in mention_user_ids:
            user = user_handler.get_user(user_id=mentioned_id)
            content = content.replace(
                "@{short_name}".format(**user),
                "@**{full_name}**".format(**user),
            )

        # All three broadcast mentions collapse to @**all**.  Zulip has no
        # equivalent of Mattermost's @here (everyone online in the
        # channel), so it is treated the same way.
        for broadcast_mention in ("@channel", "@all", "@here"):
            content = content.replace(broadcast_mention, "@**all**")
        return content

    import html2text

    html_converter = html2text.HTML2Text()

    mention_map: Dict[int, Set[int]] = {}
    pm_members = {}
    zerver_message = []

    for raw_message in raw_messages:
        message_id = NEXT_ID("message")
        mention_user_ids = get_mentioned_user_ids(raw_message, user_id_mapper)
        mention_map[message_id] = mention_user_ids

        content = html_converter.handle(
            fix_mentions(
                content=raw_message["content"],
                mention_user_ids=mention_user_ids,
            )
        )

        if len(content) > 10000:  # nocoverage
            logging.info("skipping too-long message of length %s", len(content))
            continue

        date_sent = raw_message["date_sent"]
        sender_user_id = raw_message["sender_id"]

        if "channel_name" in raw_message:
            recipient_id = get_recipient_id_from_receiver_name(
                raw_message["channel_name"], Recipient.STREAM
            )
        elif "huddle_name" in raw_message:
            recipient_id = get_recipient_id_from_receiver_name(
                raw_message["huddle_name"], Recipient.HUDDLE
            )
        elif "pm_members" in raw_message:
            members = raw_message["pm_members"]
            pm_members[message_id] = {user_id_mapper.get(member) for member in members}
            # The recipient of a 1:1 message is whichever member did not
            # send it.
            if sender_user_id == user_id_mapper.get(members[0]):
                other_member = members[1]
            else:
                other_member = members[0]
            recipient_id = get_recipient_id_from_receiver_name(other_member, Recipient.PERSONAL)
        else:
            raise AssertionError("raw_message without channel_name, huddle_name or pm_members key")

        if "attachments" in raw_message:
            has_attachment = has_link = True

            attachment_markdown, has_image = process_message_attachments(
                attachments=raw_message["attachments"],
                realm_id=realm_id,
                message_id=message_id,
                user_id=sender_user_id,
                user_handler=user_handler,
                zerver_attachment=zerver_attachment,
                uploads_list=uploads_list,
                mattermost_data_dir=mattermost_data_dir,
                output_dir=output_dir,
            )

            content += attachment_markdown
        else:
            has_attachment = has_image = has_link = False

        zerver_message.append(
            build_message(
                content=content,
                message_id=message_id,
                date_sent=date_sent,
                recipient_id=recipient_id,
                rendered_content=None,
                topic_name="imported from mattermost",
                user_id=sender_user_id,
                has_image=has_image,
                has_link=has_link,
                has_attachment=has_attachment,
            )
        )
        build_reactions(
            realm_id,
            total_reactions,
            raw_message["reactions"],
            message_id,
            user_id_mapper,
            zerver_realmemoji,
        )

    zerver_usermessage = make_user_messages(
        zerver_message=zerver_message,
        subscriber_map=subscriber_map,
        is_pm_data=is_pm_data,
        mention_map=mention_map,
    )

    message_json = {
        "zerver_message": zerver_message,
        "zerver_usermessage": zerver_usermessage,
    }

    dump_file_id = NEXT_ID("dump_file_id" + str(realm_id))
    create_converted_data_files(
        message_json, output_dir, f"/messages-{dump_file_id:06}.json"
    )
Ejemplo n.º 6
0
def process_raw_message_batch(
    realm_id: int,
    raw_messages: List[Dict[str, Any]],
    subscriber_map: Dict[int, Set[int]],
    user_id_mapper: IdMapper,
    user_handler: UserHandler,
    get_recipient_id: Callable[[ZerverFieldsT], int],
    is_pm_data: bool,
    output_dir: str,
    zerver_realmemoji: List[Dict[str, Any]],
    total_reactions: List[Dict[str, Any]],
) -> None:
    """
    Convert one batch of raw Mattermost messages and write it to a dump file.

    Each raw message has its mentions rewritten to Zulip syntax and its
    content passed through html2text; its reactions are processed through
    build_reactions().  A message is skipped when its converted content
    exceeds 10000 characters or when ``get_recipient_id`` raises KeyError
    for it.  Messages are always built with has_attachment=False.

    The converted messages and their usermessage rows are handed to
    create_converted_data_files() under a '/messages-NNNNNN.json' name
    numbered by a per-realm NEXT_ID counter.
    """
    def fix_mentions(content: str, mention_user_ids: Set[int]) -> str:
        for mentioned_id in mention_user_ids:
            user = user_handler.get_user(user_id=mentioned_id)
            content = content.replace(
                '@{short_name}'.format(**user),
                '@**{full_name}**'.format(**user),
            )

        # All three broadcast mentions collapse to @**all**.  Zulip has no
        # equivalent of Mattermost's @here (everyone online in the
        # channel), so it is treated the same way.
        for broadcast_mention in ('@channel', '@all', '@here'):
            content = content.replace(broadcast_mention, '@**all**')
        return content

    mention_map = {}  # type: Dict[int, Set[int]]
    converted_messages = []

    import html2text
    html_converter = html2text.HTML2Text()

    name_to_codepoint = get_name_to_codepoint_dict()

    for raw_message in raw_messages:
        message_id = NEXT_ID('message')
        mention_user_ids = get_mentioned_user_ids(raw_message, user_id_mapper)
        mention_map[message_id] = mention_user_ids

        content = html_converter.handle(
            fix_mentions(
                content=raw_message['content'],
                mention_user_ids=mention_user_ids,
            )
        )

        if len(content) > 10000:  # nocoverage
            logging.info('skipping too-long message of length %s' %
                         (len(content), ))
            continue

        sent_at = raw_message['pub_date']
        try:
            recipient_id = get_recipient_id(raw_message)
        except KeyError:
            logging.debug(
                "Could not find recipient_id for a message, skipping.")
            continue

        sender_id = raw_message['sender_id']

        converted_messages.append(
            build_message(
                content=content,
                message_id=message_id,
                pub_date=sent_at,
                recipient_id=recipient_id,
                rendered_content=None,
                topic_name='imported from mattermost',
                user_id=sender_id,
                has_attachment=False,
            )
        )
        build_reactions(realm_id, total_reactions, raw_message["reactions"],
                        message_id, name_to_codepoint, user_id_mapper,
                        zerver_realmemoji)

    zerver_usermessage = make_user_messages(
        zerver_message=converted_messages,
        subscriber_map=subscriber_map,
        is_pm_data=is_pm_data,
        mention_map=mention_map,
    )

    message_json = {
        'zerver_message': converted_messages,
        'zerver_usermessage': zerver_usermessage,
    }

    dump_file_id = NEXT_ID('dump_file_id' + str(realm_id))
    create_converted_data_files(
        message_json, output_dir, "/messages-%06d.json" % (dump_file_id, ))
Ejemplo n.º 7
0
def channel_message_to_zerver_message(
    realm_id: int, users: List[ZerverFieldsT],
    slack_user_id_to_zulip_user_id: SlackToZulipUserIDT,
    slack_recipient_name_to_zulip_recipient_id: SlackToZulipRecipientT,
    all_messages: List[ZerverFieldsT], zerver_realmemoji: List[ZerverFieldsT],
    subscriber_map: Dict[int, Set[int]], added_channels: AddedChannelsT,
    dm_members: DMMembersT, domain_name: str, long_term_idle: Set[int]
) -> Tuple[List[ZerverFieldsT], List[ZerverFieldsT], List[ZerverFieldsT],
           List[ZerverFieldsT], List[ZerverFieldsT]]:
    """
    Convert Slack messages into Zulip message, usermessage, attachment,
    upload and reaction records.

    A message is skipped when get_message_sending_user() returns a falsy
    value, when its subtype is one of the pin/join/leave/rename notices
    listed below, or when converting its text raises.

    The recipient is chosen from whichever of the 'channel_name',
    'mpim_name' or 'pm_name' keys the message carries.  For 'pm_name'
    messages, usermessage rows are built for the recipient and, when it
    differs, also for the sender's own recipient id.

    Raises:
    AssertionError if a message that reaches recipient resolution carries
    none of the three keys.  Previously such a message either failed on
    an unbound local or silently reused the recipient of the preceding
    message.

    Returns:
    1. zerver_message, which is a list of the messages
    2. zerver_usermessage, which is a list of the usermessages
    3. zerver_attachment, which is a list of the attachments
    4. uploads_list, which is a list of uploads to be mapped in uploads records.json
    5. reaction_list, which is a list of all user reactions
    """
    zerver_message = []
    zerver_usermessage = []  # type: List[ZerverFieldsT]
    uploads_list = []  # type: List[ZerverFieldsT]
    zerver_attachment = []  # type: List[ZerverFieldsT]
    reaction_list = []  # type: List[ZerverFieldsT]

    # For unicode emoji
    with open(NAME_TO_CODEPOINT_PATH) as fp:
        name_to_codepoint = ujson.load(fp)

    total_user_messages = 0
    total_skipped_user_messages = 0
    for message in all_messages:
        slack_user_id = get_message_sending_user(message)
        if not slack_user_id:
            # Ignore messages without slack_user_id.
            # These are sometimes produced by Slack.
            continue

        subtype = message.get('subtype', False)
        if subtype in [
                # Zulip doesn't have a pinned_item concept
                "pinned_item",
                "unpinned_item",
                # Slack's channel join/leave notices are spammy
                "channel_join",
                "channel_leave",
                "channel_name"
        ]:
            continue

        try:
            content, mentioned_user_ids, has_link = convert_to_zulip_markdown(
                message['text'], users, added_channels,
                slack_user_id_to_zulip_user_id)
        except Exception:
            # Best effort: any failure here (including a missing 'text' key)
            # is reported on stdout and the message is dropped.
            print("Slack message unexpectedly missing text representation:")
            print(ujson.dumps(message, indent=4))
            continue
        rendered_content = None

        if "channel_name" in message:
            is_private = False
            recipient_id = slack_recipient_name_to_zulip_recipient_id[
                message['channel_name']]
        elif "mpim_name" in message:
            is_private = True
            recipient_id = slack_recipient_name_to_zulip_recipient_id[
                message['mpim_name']]
        elif "pm_name" in message:
            is_private = True
            sender = get_message_sending_user(message)
            members = dm_members[message['pm_name']]
            # The recipient is whichever DM member did not send the message;
            # the sender's own recipient id is kept for the second set of
            # usermessage rows below.
            if sender == members[0]:
                recipient_id = slack_recipient_name_to_zulip_recipient_id[
                    members[1]]
                sender_recipient_id = slack_recipient_name_to_zulip_recipient_id[
                    members[0]]
            else:
                recipient_id = slack_recipient_name_to_zulip_recipient_id[
                    members[0]]
                sender_recipient_id = slack_recipient_name_to_zulip_recipient_id[
                    members[1]]
        else:
            # Fail loudly instead of reusing recipient_id/is_private left
            # over from a previous iteration.
            raise AssertionError(
                "message without channel_name, mpim_name or pm_name key")

        message_id = NEXT_ID('message')

        if 'reactions' in message:
            build_reactions(reaction_list, message['reactions'],
                            slack_user_id_to_zulip_user_id, message_id,
                            name_to_codepoint, zerver_realmemoji)

        # Process different subtypes of slack messages

        # Subtypes which have only the action in the message should
        # be rendered with '/me' in the content initially
        # For example "sh_room_created" has the message 'started a call'
        # which should be displayed as '/me started a call'
        if subtype in ["bot_add", "sh_room_created", "me_message"]:
            content = '/me %s' % (content, )
        if subtype == 'file_comment':
            # The file_comment message type only indicates the
            # responsible user in a subfield.
            message['user'] = message['comment']['user']

        file_info = process_message_files(
            message=message,
            domain_name=domain_name,
            realm_id=realm_id,
            message_id=message_id,
            slack_user_id=slack_user_id,
            users=users,
            slack_user_id_to_zulip_user_id=slack_user_id_to_zulip_user_id,
            zerver_attachment=zerver_attachment,
            uploads_list=uploads_list,
        )

        content += file_info['content']
        has_link = has_link or file_info['has_link']

        has_attachment = file_info['has_attachment']
        has_image = file_info['has_image']

        topic_name = 'imported from slack'

        zulip_message = build_message(
            topic_name, float(message['ts']), message_id, content,
            rendered_content, slack_user_id_to_zulip_user_id[slack_user_id],
            recipient_id, has_image, has_link, has_attachment)
        zerver_message.append(zulip_message)

        (num_created, num_skipped) = build_usermessages(
            zerver_usermessage=zerver_usermessage,
            subscriber_map=subscriber_map,
            recipient_id=recipient_id,
            mentioned_user_ids=mentioned_user_ids,
            message_id=message_id,
            is_private=is_private,
            long_term_idle=long_term_idle,
        )
        total_user_messages += num_created
        total_skipped_user_messages += num_skipped

        if "pm_name" in message and recipient_id != sender_recipient_id:
            # Also build usermessage rows against the sender's own
            # recipient id.
            (num_created, num_skipped) = build_usermessages(
                zerver_usermessage=zerver_usermessage,
                subscriber_map=subscriber_map,
                recipient_id=sender_recipient_id,
                mentioned_user_ids=mentioned_user_ids,
                message_id=message_id,
                is_private=is_private,
                long_term_idle=long_term_idle,
            )
            total_user_messages += num_created
            total_skipped_user_messages += num_skipped

    logging.debug(
        "Created %s UserMessages; deferred %s due to long-term idle",
        total_user_messages, total_skipped_user_messages)
    return zerver_message, zerver_usermessage, zerver_attachment, uploads_list, \
        reaction_list
Ejemplo n.º 8
0
def convert_gitter_workspace_messages(
        gitter_data: GitterDataT,
        output_dir: str,
        subscriber_map: Dict[int, Set[int]],
        user_map: Dict[str, int],
        user_short_name_to_full_name: Dict[str, str],
        chunk_size: int = MESSAGE_BATCH_CHUNK_SIZE) -> None:
    """
    Convert Gitter messages and store them in batch files.

    ``gitter_data`` is walked in consecutive windows of ``chunk_size``
    entries.  Every non-empty window is written to its own
    ``messages-NNNNNN.json`` file in ``output_dir`` (numbered from 1)
    containing the 'zerver_message' and 'zerver_usermessage' lists for
    that window.  Message ids count up from 0 across all files, every
    message uses recipient id 0, and usermessage rows are built with
    is_private=False.
    """
    logging.info('######### IMPORTING MESSAGES STARTED #########\n')
    gitter_recipient_id = 0  # Corresponding to stream "gitter"
    message_id = 0
    window_start = 0
    file_number = 1

    while True:
        window_end = window_start + chunk_size
        window = gitter_data[window_start:window_end]
        if len(window) == 0:
            # No messages left to convert.
            break

        converted_messages = []
        usermessage_rows = []  # type: List[ZerverFieldsT]

        for raw in window:
            sent_at = dateutil.parser.parse(raw['sent']).timestamp()
            mentioned_user_ids = get_usermentions(
                raw, user_map, user_short_name_to_full_name)
            sender_id = user_map[raw['fromUser']['id']]

            # rendered_content is left as None for every imported message.
            converted = build_message(
                'imported from gitter', float(sent_at), message_id,
                raw['text'], None, sender_id, gitter_recipient_id)
            converted_messages.append(converted)

            build_usermessages(
                zerver_usermessage=usermessage_rows,
                subscriber_map=subscriber_map,
                recipient_id=gitter_recipient_id,
                mentioned_user_ids=mentioned_user_ids,
                message_id=message_id,
                is_private=False,
            )

            message_id += 1

        batch_payload = {
            'zerver_message': converted_messages,
            'zerver_usermessage': usermessage_rows,
        }
        message_filename = os.path.join(
            output_dir, "messages-%06d.json" % (file_number, ))
        logging.info("Writing Messages to %s\n" % (message_filename, ))
        write_data_to_file(message_filename, batch_payload)

        window_start = window_end
        file_number += 1

    logging.info('######### IMPORTING MESSAGES FINISHED #########\n')
Ejemplo n.º 9
0
def process_raw_message_batch(realm_id: int,
                              raw_messages: List[Dict[str, Any]],
                              subscriber_map: Dict[int, Set[int]],
                              user_id_mapper: IdMapper,
                              user_handler: UserHandler,
                              get_recipient_id_from_receiver_name: Callable[[str, int], int],
                              is_pm_data: bool,
                              output_dir: str,
                              zerver_realmemoji: List[Dict[str, Any]],
                              total_reactions: List[Dict[str, Any]],
                              ) -> None:
    """
    Convert one batch of raw Mattermost messages and write it out as a
    numbered messages-NNNNNN.json dump via create_converted_data_files.

    For each raw message this allocates a message id with NEXT_ID,
    rewrites mentions into Zulip syntax, runs the text through html2text,
    resolves the recipient (stream, huddle or personal), builds the
    message row with build_message and passes the message's reactions to
    build_reactions.  UserMessage rows for the whole batch are then
    produced by make_user_messages.

    Messages whose converted content is longer than 10000 characters are
    logged and skipped.

    Raises:
        AssertionError: if a raw message has none of the keys
            "channel_name", "huddle_name" or "pm_members".
    """

    def fix_mentions(content: str, mention_user_ids: Set[int]) -> str:
        """Rewrite Mattermost-style mentions in content to Zulip syntax."""
        for user_id in mention_user_ids:
            user = user_handler.get_user(user_id=user_id)
            mattermost_mention = '@{short_name}'.format(**user)
            zulip_mention = '@**{full_name}**'.format(**user)
            content = content.replace(mattermost_mention, zulip_mention)

        content = content.replace('@channel', '@**all**')
        content = content.replace('@all', '@**all**')
        # We don't have an equivalent for Mattermost's @here mention which mentions all users
        # online in the channel.
        content = content.replace('@here', '@**all**')
        return content

    # message id -> ids of the users mentioned in that message.
    mention_map: Dict[int, Set[int]] = {}
    zerver_message = []

    import html2text
    h = html2text.HTML2Text()

    # NOTE(review): pm_members is filled in below but never read in this
    # function.  It is kept because the user_id_mapper.get() calls that
    # populate it may have side effects -- confirm before removing.
    pm_members = {}

    for raw_message in raw_messages:
        message_id = NEXT_ID('message')
        mention_user_ids = get_mentioned_user_ids(raw_message, user_id_mapper)
        mention_map[message_id] = mention_user_ids

        content = fix_mentions(
            content=raw_message['content'],
            mention_user_ids=mention_user_ids,
        )
        content = h.handle(content)

        if len(content) > 10000:  # nocoverage
            # Pass the length as a logging argument so the string is only
            # formatted when the record is actually emitted.
            logging.info('skipping too-long message of length %s', len(content))
            continue

        date_sent = raw_message['date_sent']
        sender_user_id = raw_message['sender_id']
        if "channel_name" in raw_message:
            recipient_id = get_recipient_id_from_receiver_name(raw_message["channel_name"], Recipient.STREAM)
        elif "huddle_name" in raw_message:
            recipient_id = get_recipient_id_from_receiver_name(raw_message["huddle_name"], Recipient.HUDDLE)
        elif "pm_members" in raw_message:
            members = raw_message["pm_members"]
            member_ids = {user_id_mapper.get(member) for member in members}
            pm_members[message_id] = member_ids
            # The recipient of a personal message is whichever of the two
            # members is not the sender.
            if sender_user_id == user_id_mapper.get(members[0]):
                recipient_id = get_recipient_id_from_receiver_name(members[1], Recipient.PERSONAL)
            else:
                recipient_id = get_recipient_id_from_receiver_name(members[0], Recipient.PERSONAL)
        else:
            raise AssertionError("raw_message without channel_name, huddle_name or pm_members key")

        rendered_content = None

        topic_name = 'imported from mattermost'

        message = build_message(
            content=content,
            message_id=message_id,
            date_sent=date_sent,
            recipient_id=recipient_id,
            rendered_content=rendered_content,
            topic_name=topic_name,
            user_id=sender_user_id,
            has_attachment=False,
        )
        zerver_message.append(message)
        build_reactions(realm_id, total_reactions, raw_message["reactions"], message_id,
                        user_id_mapper, zerver_realmemoji)

    zerver_usermessage = make_user_messages(
        zerver_message=zerver_message,
        subscriber_map=subscriber_map,
        is_pm_data=is_pm_data,
        mention_map=mention_map,
    )

    message_json = dict(
        zerver_message=zerver_message,
        zerver_usermessage=zerver_usermessage,
    )

    # Dump files are numbered per realm.
    dump_file_id = NEXT_ID('dump_file_id' + str(realm_id))
    message_file = "/messages-%06d.json" % (dump_file_id,)
    create_converted_data_files(message_json, output_dir, message_file)
Ejemplo n.º 10
0
def process_raw_message_batch(
    realm_id: int,
    raw_messages: List[Dict[str, Any]],
    subscriber_map: Dict[int, Set[int]],
    user_handler: UserHandler,
    is_pm_data: bool,
    output_dir: str,
    zerver_realmemoji: List[ZerverFieldsT],
    total_reactions: List[ZerverFieldsT],
    uploads_list: List[ZerverFieldsT],
    zerver_attachment: List[ZerverFieldsT],
    upload_id_to_upload_data_map: Dict[str, Dict[str, Any]],
) -> None:
    """
    Convert one batch of raw Rocket.Chat messages and write it out as a
    numbered messages-NNNNNN.json dump via create_converted_data_files.

    Each raw message gets a fresh id from NEXT_ID, has its user, wildcard
    and channel mentions rewritten, has any uploaded file processed by
    process_message_attachment, and is turned into a message row with
    build_message.  Messages whose content exceeds 10000 characters are
    logged and skipped.
    """

    def fix_mentions(content: str, mention_user_ids: Set[int],
                     rc_channel_mention_data: List[Dict[str, str]]) -> str:
        """Rewrite Rocket.Chat mentions in content to Zulip syntax."""
        # User mentions: @short_name -> @**Full Name**
        for mentioned_id in mention_user_ids:
            mentioned_user = user_handler.get_user(user_id=mentioned_id)
            content = content.replace(
                "@{short_name}".format(**mentioned_user),
                "@**{full_name}**".format(**mentioned_user),
            )

        # Wildcard mentions.  We don't have an equivalent for
        # Rocket.Chat's @here mention (all users active in the channel),
        # so it is mapped to @**all** as well.
        for wildcard in ("@all", "@here"):
            content = content.replace(wildcard, "@**all**")

        # Channel mentions, using the precomputed replacement pairs.
        for channel_mention in rc_channel_mention_data:
            content = content.replace(
                channel_mention["rc_mention"],
                channel_mention["zulip_mention"],
            )

        return content

    mention_map: Dict[int, Set[int]] = {}
    zerver_message: List[ZerverFieldsT] = []

    for raw in raw_messages:
        message_id = NEXT_ID("message")
        mentioned_ids = raw["mention_user_ids"]
        mention_map[message_id] = mentioned_ids

        content = fix_mentions(
            content=raw["content"],
            mention_user_ids=mentioned_ids,
            rc_channel_mention_data=raw["rc_channel_mention_data"],
        )

        content_length = len(content)
        if content_length > 10000:  # nocoverage
            logging.info("skipping too-long message of length %s",
                         content_length)
            continue

        date_sent = raw["date_sent"]
        sender_user_id = raw["sender_id"]
        recipient_id = raw["recipient_id"]
        has_link = raw["has_link"]

        # A message carrying a file always counts as having an attachment
        # and a link; whether it has an image is decided by the
        # attachment processing.
        has_attachment = "file" in raw
        has_image = False
        if has_attachment:
            has_link = True
            attachment_content, has_image = process_message_attachment(
                upload=raw["file"],
                realm_id=realm_id,
                message_id=message_id,
                user_id=sender_user_id,
                user_handler=user_handler,
                uploads_list=uploads_list,
                zerver_attachment=zerver_attachment,
                upload_id_to_upload_data_map=upload_id_to_upload_data_map,
                output_dir=output_dir,
            )
            content += attachment_content

        zerver_message.append(
            build_message(
                content=content,
                message_id=message_id,
                date_sent=date_sent,
                recipient_id=recipient_id,
                rendered_content=None,
                topic_name=raw["topic_name"],
                user_id=sender_user_id,
                has_image=has_image,
                has_link=has_link,
                has_attachment=has_attachment,
            )
        )
        build_reactions(
            total_reactions=total_reactions,
            reactions=raw["reactions"],
            message_id=message_id,
            zerver_realmemoji=zerver_realmemoji,
        )

    message_json = {
        "zerver_message": zerver_message,
        "zerver_usermessage": make_user_messages(
            zerver_message=zerver_message,
            subscriber_map=subscriber_map,
            is_pm_data=is_pm_data,
            mention_map=mention_map,
        ),
    }

    # Dump files are numbered per realm.
    dump_file_id = NEXT_ID("dump_file_id" + str(realm_id))
    message_file = f"/messages-{dump_file_id:06}.json"
    create_converted_data_files(message_json, output_dir, message_file)
Ejemplo n.º 11
0
def channel_message_to_zerver_message(
    realm_id: int, users: List[ZerverFieldsT], added_users: AddedUsersT,
    added_recipient: AddedRecipientsT, all_messages: List[ZerverFieldsT],
    zerver_realmemoji: List[ZerverFieldsT],
    zerver_subscription: List[ZerverFieldsT], added_channels: AddedChannelsT,
    id_list: Tuple[int, int, int, int], domain_name: str
) -> Tuple[List[ZerverFieldsT], List[ZerverFieldsT], List[ZerverFieldsT],
           List[ZerverFieldsT], List[ZerverFieldsT], Tuple[int, int, int, int]]:
    """
    Convert a list of Slack messages into Zulip import rows.

    id_list carries the next ids to use for messages, usermessages,
    reactions and attachments (in that order); the updated counters are
    returned as the last element of the result.

    Returns:
    1. zerver_message, which is a list of the messages
    2. zerver_usermessage, which is a list of the usermessages
    3. zerver_attachment, which is a list of the attachments
    4. uploads_list, which is a list of uploads to be mapped in uploads records.json
    5. reaction_list, which is a list of all user reactions
    6. id_list, which is a tuple of max ids of messages, usermessages, reactions and attachments
    """
    message_id_count, usermessage_id_count, reaction_id_count, attachment_id_count = id_list
    zerver_message = []
    zerver_usermessage = []  # type: List[ZerverFieldsT]
    uploads_list = []  # type: List[ZerverFieldsT]
    zerver_attachment = []  # type: List[ZerverFieldsT]
    reaction_list = []  # type: List[ZerverFieldsT]

    # Subtypes that are dropped entirely.
    skipped_subtypes = (
        # Zulip doesn't have a pinned_item concept
        "pinned_item",
        "unpinned_item",
        # Slack's channel join/leave notices are spammy
        "channel_join",
        "channel_leave",
        "channel_name",
    )

    # For unicode emoji
    with open(NAME_TO_CODEPOINT_PATH) as fp:
        name_to_codepoint = ujson.load(fp)

    for message in all_messages:
        user = get_message_sending_user(message)
        if not user:
            # Ignore messages without user names.
            # These are sometimes produced by Slack.
            continue

        subtype = message.get('subtype', False)
        if subtype in skipped_subtypes:
            continue

        has_attachment = has_image = False
        try:
            content, mentioned_users_id, has_link = convert_to_zulip_markdown(
                message['text'], users, added_channels, added_users)
        except Exception:
            print("Slack message unexpectedly missing text representation:")
            print(json.dumps(message, indent=4))
            continue
        rendered_content = None

        recipient_id = added_recipient[message['channel_name']]
        message_id = message_id_count

        # Process message reactions
        if 'reactions' in message:
            reaction_id_count = build_reactions(
                reaction_list, message['reactions'], added_users, message_id,
                reaction_id_count, name_to_codepoint, zerver_realmemoji)

        # Process different subtypes of slack messages

        # Subtypes which have only the action in the message should
        # be rendered with '/me' in the content initially
        # For example "sh_room_created" has the message 'started a call'
        # which should be displayed as '/me started a call'
        if subtype in ("bot_add", "sh_room_created", "me_message"):
            content = '/me %s' % (content,)

        files = message.get('files', [])
        if subtype == 'file_share':
            # In Slack messages, uploads can either have the subtype as 'file_share' or
            # have the upload information in 'files' keyword
            files = [message['file']]

        # NOTE(review): both branches below rebind `content`, so the file
        # text replaces (rather than extends) the converted message text,
        # and only the last file's text survives -- confirm this is intended.
        for fileinfo in files:
            url = fileinfo['url_private']
            # For attachments with slack download link
            if 'files.slack.com' in url:
                has_attachment = has_link = True
                # Only ever switch has_image on: a later non-image file
                # must not reset the flag set by an earlier image.
                if 'image' in fileinfo['mimetype']:
                    has_image = True

                file_user = [
                    iterate_user for iterate_user in users
                    if message['user'] == iterate_user['id']
                ]
                file_user_email = get_user_email(file_user[0], domain_name)

                s3_path, content = get_attachment_path_and_content(
                    fileinfo, realm_id)

                # construct attachments
                build_uploads(added_users[user], realm_id, file_user_email,
                              fileinfo, s3_path, uploads_list)

                attachment_id = attachment_id_count
                build_attachment(realm_id, message_id, attachment_id,
                                 added_users[user], fileinfo, s3_path,
                                 zerver_attachment)
                attachment_id_count += 1
            # For attachments with link not from slack
            # Example: Google drive integration
            else:
                has_link = True
                if 'title' in fileinfo:
                    file_name = fileinfo['title']
                else:
                    file_name = fileinfo['name']
                content = '[%s](%s)' % (file_name, url)

        # construct message
        subject = 'imported from slack'

        zulip_message = build_message(subject, float(message['ts']),
                                      message_id, content, rendered_content,
                                      added_users[user], recipient_id,
                                      has_image, has_link, has_attachment)
        zerver_message.append(zulip_message)

        # construct usermessages
        usermessage_id_count = build_usermessages(
            zerver_usermessage, usermessage_id_count, zerver_subscription,
            recipient_id, mentioned_users_id, message_id)

        message_id_count += 1

    id_list = (message_id_count, usermessage_id_count, reaction_id_count,
               attachment_id_count)
    return zerver_message, zerver_usermessage, zerver_attachment, uploads_list, \
        reaction_list, id_list
Ejemplo n.º 12
0
def channel_message_to_zerver_message(realm_id: int,
                                      users: List[ZerverFieldsT],
                                      added_users: AddedUsersT,
                                      added_recipient: AddedRecipientsT,
                                      all_messages: List[ZerverFieldsT],
                                      zerver_realmemoji: List[ZerverFieldsT],
                                      subscriber_map: Dict[int, Set[int]],
                                      added_channels: AddedChannelsT,
                                      domain_name: str,
                                      long_term_idle: Set[int]) -> Tuple[List[ZerverFieldsT],
                                                                         List[ZerverFieldsT],
                                                                         List[ZerverFieldsT],
                                                                         List[ZerverFieldsT],
                                                                         List[ZerverFieldsT]]:
    """
    Convert a list of Slack messages into Zulip import rows.

    Message ids are allocated with NEXT_ID.  The counts returned by
    build_usermessages (created and skipped) are summed over the batch
    and reported in a debug log line.

    Returns:
    1. zerver_message, which is a list of the messages
    2. zerver_usermessage, which is a list of the usermessages
    3. zerver_attachment, which is a list of the attachments
    4. uploads_list, which is a list of uploads to be mapped in uploads records.json
    5. reaction_list, which is a list of all user reactions
    """
    zerver_message = []
    zerver_usermessage = []  # type: List[ZerverFieldsT]
    uploads_list = []  # type: List[ZerverFieldsT]
    zerver_attachment = []  # type: List[ZerverFieldsT]
    reaction_list = []  # type: List[ZerverFieldsT]

    # Subtypes that are dropped entirely.
    skipped_subtypes = (
        # Zulip doesn't have a pinned_item concept
        "pinned_item",
        "unpinned_item",
        # Slack's channel join/leave notices are spammy
        "channel_join",
        "channel_leave",
        "channel_name",
    )
    # Subtypes which have only the action in the message should
    # be rendered with '/me' in the content initially
    # For example "sh_room_created" has the message 'started a call'
    # which should be displayed as '/me started a call'
    me_subtypes = ("bot_add", "sh_room_created", "me_message")

    # For unicode emoji
    with open(NAME_TO_CODEPOINT_PATH) as fp:
        name_to_codepoint = ujson.load(fp)

    total_user_messages = 0
    total_skipped_user_messages = 0
    for message in all_messages:
        user = get_message_sending_user(message)
        if not user:
            # Ignore messages without user names.
            # These are sometimes produced by Slack.
            continue

        subtype = message.get('subtype', False)
        if subtype in skipped_subtypes:
            continue

        try:
            content, mentioned_user_ids, has_link = convert_to_zulip_markdown(
                message['text'], users, added_channels, added_users)
        except Exception:
            print("Slack message unexpectedly missing text representation:")
            print(ujson.dumps(message, indent=4))
            continue
        rendered_content = None

        recipient_id = added_recipient[message['channel_name']]
        message_id = NEXT_ID('message')

        # Process message reactions
        if 'reactions' in message:
            build_reactions(reaction_list, message['reactions'], added_users,
                            message_id, name_to_codepoint,
                            zerver_realmemoji)

        # Process different subtypes of slack messages
        if subtype in me_subtypes:
            content = '/me %s' % (content,)
        if subtype == 'file_comment':
            # The file_comment message type only indicates the
            # responsible user in a subfield.
            message['user'] = message['comment']['user']

        file_info = process_message_files(
            message=message,
            domain_name=domain_name,
            realm_id=realm_id,
            message_id=message_id,
            user=user,
            users=users,
            added_users=added_users,
            zerver_attachment=zerver_attachment,
            uploads_list=uploads_list,
        )

        content += file_info['content']
        has_link = has_link or file_info['has_link']

        has_attachment = file_info['has_attachment']
        has_image = file_info['has_image']

        # construct message
        topic_name = 'imported from slack'

        zulip_message = build_message(topic_name, float(message['ts']), message_id, content,
                                      rendered_content, added_users[user], recipient_id,
                                      has_image, has_link, has_attachment)
        zerver_message.append(zulip_message)

        # construct usermessages
        (num_created, num_skipped) = build_usermessages(
            zerver_usermessage=zerver_usermessage,
            subscriber_map=subscriber_map,
            recipient_id=recipient_id,
            mentioned_user_ids=mentioned_user_ids,
            message_id=message_id,
            long_term_idle=long_term_idle,
        )
        total_user_messages += num_created
        total_skipped_user_messages += num_skipped

    # Pass the counters as logging arguments so the string is only
    # formatted when debug logging is actually enabled.
    logging.debug("Created %s UserMessages; deferred %s due to long-term idle",
                  total_user_messages, total_skipped_user_messages)
    return zerver_message, zerver_usermessage, zerver_attachment, uploads_list, \
        reaction_list
Ejemplo n.º 13
0
def channel_message_to_zerver_message(
    realm_id: int, users: List[ZerverFieldsT], added_users: AddedUsersT,
    added_recipient: AddedRecipientsT, all_messages: List[ZerverFieldsT],
    zerver_realmemoji: List[ZerverFieldsT], subscriber_map: Dict[int,
                                                                 Set[int]],
    added_channels: AddedChannelsT, domain_name: str
) -> Tuple[List[ZerverFieldsT], List[ZerverFieldsT], List[ZerverFieldsT],
           List[ZerverFieldsT], List[ZerverFieldsT]]:
    """
    Build the Zulip import rows for a list of Slack messages.

    Returns a 5-tuple of lists, in this order:
    1. zerver_message: the converted messages
    2. zerver_usermessage: the usermessages
    3. zerver_attachment: the attachments
    4. uploads_list: uploads to be mapped in uploads records.json
    5. reaction_list: all user reactions
    """
    # Subtypes that never become Zulip messages.
    ignored_subtypes = (
        # Zulip doesn't have a pinned_item concept
        "pinned_item",
        "unpinned_item",
        # Slack's channel join/leave notices are spammy
        "channel_join",
        "channel_leave",
        "channel_name",
    )
    # Subtypes whose text is only an action (e.g. "sh_room_created" has
    # the text 'started a call'); these are shown as '/me started a call'.
    action_only_subtypes = ("bot_add", "sh_room_created", "me_message")

    zerver_message = []
    zerver_usermessage = []  # type: List[ZerverFieldsT]
    uploads_list = []  # type: List[ZerverFieldsT]
    zerver_attachment = []  # type: List[ZerverFieldsT]
    reaction_list = []  # type: List[ZerverFieldsT]

    # Mapping used for unicode emoji.
    with open(NAME_TO_CODEPOINT_PATH) as codepoint_file:
        name_to_codepoint = ujson.load(codepoint_file)

    for raw_message in all_messages:
        sender = get_message_sending_user(raw_message)
        if not sender:
            # Slack sometimes produces messages without a user name;
            # those are ignored.
            continue

        message_subtype = raw_message.get('subtype', False)
        if message_subtype in ignored_subtypes:
            continue

        try:
            content, mentioned_user_ids, has_link = convert_to_zulip_markdown(
                raw_message['text'], users, added_channels, added_users)
        except Exception:
            print("Slack message unexpectedly missing text representation:")
            print(ujson.dumps(raw_message, indent=4))
            continue

        recipient_id = added_recipient[raw_message['channel_name']]
        message_id = NEXT_ID('message')

        # Reactions attached to the message.
        if 'reactions' in raw_message:
            build_reactions(reaction_list, raw_message['reactions'],
                            added_users, message_id, name_to_codepoint,
                            zerver_realmemoji)

        # Subtype-specific handling.
        if message_subtype in action_only_subtypes:
            content = '/me {}'.format(content)
        if message_subtype == 'file_comment':
            # A file_comment message names the responsible user only in
            # a subfield, so copy it up to the top level.
            raw_message['user'] = raw_message['comment']['user']

        file_info = process_message_files(
            message=raw_message,
            domain_name=domain_name,
            realm_id=realm_id,
            message_id=message_id,
            user=sender,
            users=users,
            added_users=added_users,
            zerver_attachment=zerver_attachment,
            uploads_list=uploads_list,
        )

        content += file_info['content']
        has_link = has_link or file_info['has_link']
        has_attachment = file_info['has_attachment']
        has_image = file_info['has_image']

        # Message row; every imported message shares one topic and is
        # built with no rendered content.
        zerver_message.append(
            build_message('imported from slack', float(raw_message['ts']),
                          message_id, content, None, added_users[sender],
                          recipient_id, has_image, has_link, has_attachment))

        # UserMessage rows for this message.
        build_usermessages(
            zerver_usermessage=zerver_usermessage,
            subscriber_map=subscriber_map,
            recipient_id=recipient_id,
            mentioned_user_ids=mentioned_user_ids,
            message_id=message_id,
        )

    return (zerver_message, zerver_usermessage, zerver_attachment,
            uploads_list, reaction_list)
Ejemplo n.º 14
0
def convert_gitter_workspace_messages(
    gitter_data: GitterDataT,
    output_dir: str,
    subscriber_map: Dict[int, Set[int]],
    user_map: Dict[str, int],
    stream_map: Dict[str, int],
    user_short_name_to_full_name: Dict[str, str],
    chunk_size: int = MESSAGE_BATCH_CHUNK_SIZE,
) -> None:
    """
    Convert Gitter messages and write them out in batches.

    gitter_data is consumed in slices of chunk_size messages; each slice
    is written to its own messages-NNNNNN.json file in output_dir, with
    file numbers starting at 1.  Message ids are assigned sequentially
    from 0 across all batches.
    """
    logging.info("######### IMPORTING MESSAGES STARTED #########\n")

    next_message_id = 0
    batch_start = 0
    batch_number = 1

    while True:
        batch = gitter_data[batch_start:batch_start + chunk_size]
        if not batch:
            break

        batch_messages = []
        batch_usermessages: List[ZerverFieldsT] = []

        for message in batch:
            sent_timestamp = dateutil.parser.parse(message["sent"]).timestamp()
            mentioned_user_ids = get_usermentions(
                message, user_map, user_short_name_to_full_name)

            # Messages that carry a room get a room-specific topic and are
            # sent to that room's entry in stream_map; the rest fall back
            # to recipient 0.
            has_room = "room" in message
            topic_name = "imported from Gitter"
            if has_room:
                topic_name += f' room {message["room"]}'
            sender_id = user_map[message["fromUser"]["id"]]
            recipient_id = stream_map[message["room"]] if has_room else 0

            batch_messages.append(
                build_message(
                    topic_name,
                    float(sent_timestamp),
                    next_message_id,
                    message["text"],
                    None,  # rendered_content
                    sender_id,
                    recipient_id,
                )
            )

            build_usermessages(
                zerver_usermessage=batch_usermessages,
                subscriber_map=subscriber_map,
                recipient_id=recipient_id,
                mentioned_user_ids=mentioned_user_ids,
                message_id=next_message_id,
                is_private=False,
            )

            next_message_id += 1

        batch_path = os.path.join(output_dir,
                                  f"messages-{batch_number:06}.json")
        logging.info("Writing messages to %s\n", batch_path)
        write_data_to_file(batch_path, {
            "zerver_message": batch_messages,
            "zerver_usermessage": batch_usermessages,
        })

        batch_start += chunk_size
        batch_number += 1

    logging.info("######### IMPORTING MESSAGES FINISHED #########\n")