# Example 1
    def _fetch_messages_from_server(self, peer, buffer):
        """ Retrieves a chunk (up to 100) of messages from Telegram's DC and
            appends their formatted representations to 'buffer'.

            :param peer:        Chat/Channel object to fetch history from
            :param buffer:      buffer where formatted message strings are placed

            :return: latest_message_id - the latest/biggest Message ID that
                     successfully went into the buffer, or -1 when nothing new
                     was fetched.
        """
        messages = []

        # First retrieve the messages and some information.
        # Make up to 5 attempts: a FloodWaitError forces a sleep + reconnect.
        for _ in range(0, 5):
            try:
                # NOTE: Telethon will make 5 attempts to reconnect
                # before failing
                messages = self.get_messages(peer,
                                             limit=100,
                                             offset_id=self.id_offset)

                # Check the container is non-empty before touching `.total`
                # so an empty/plain-list result never raises.
                if messages and messages.total > 0:
                    sprint('Processing messages with ids {}-{} ...'.format(
                        messages[0].id, messages[-1].id))
            except FloodWaitError as ex:
                sprint(
                    'FloodWaitError detected. Sleep for {} sec before reconnecting! \n'
                    .format(ex.seconds))
                sleep(ex.seconds)
                self._init_connect()
                continue
            break

        # -1 signals "nothing new": either no messages came back, or everything
        # fetched is already covered by the previously saved last_message_id.
        latest_message_id = -1 \
            if not messages or self.settings.last_message_id >= messages[0].id \
            else messages[0].id

        # Iterate over all (in reverse order so the latest appear
        # the last in the console) and print them with format provided by exporter.
        for msg in messages:
            # The very first record exported overall gets special treatment
            # by the exporter (e.g. no leading separator).
            self.exporter_context.is_first_record = \
                self.msg_count_to_process == 1

            # Stop as soon as we reach messages already dumped in a prior run.
            if self.settings.last_message_id >= msg.id:
                self.msg_count_to_process = 0
                break

            msg_dump_str = self.exporter.format(msg, self.exporter_context)

            buffer.append(msg_dump_str)

            self.msg_count_to_process -= 1
            self.id_offset = msg.id
            self.exporter_context.is_last_record = False
            if self.msg_count_to_process == 0:
                break

        return latest_message_id
 def _check_preconditions(self):
     """ Validate the environment before any dumping starts.

         Incremental mode requires the output file to already exist;
         non-incremental mode asks for overwrite confirmation and verifies
         the output path is writable.

         :raises DumpingError: when a precondition is not met.
     """
     out_file_path = self.settings.out_file
     if not self.settings.is_incremental_mode:
         # In NONE-incremental mode
         if os.path.exists(out_file_path):
             sprint('Warning: The output file already exists.')
             if not self._is_user_confirmed(
                     'Are you sure you want to overwrite it? [y/n]'):
                 raise DumpingError("Terminating on user's request...")
         # Check if output file can be created/overwritten
         try:
             with open(out_file_path, mode='w+'):
                 pass
         except OSError as ex:
             raise DumpingError(
                 'Output file path "{}" is invalid. {}'.format(
                     out_file_path, ex.strerror))
         msg_count = self.msg_count_to_process
         sprint('Dumping {} messages into "{}" file ...'.format(
             'all' if msg_count == sys.maxsize else msg_count,
             out_file_path))
     else:
         # In incremental mode
         sprint('Switching to incremental mode.')
         self.logger.debug('Checking if output file exists.')
         if not os.path.exists(out_file_path):
             raise DumpingError(
                 'Error: Output file does not exist. Path="' +
                 out_file_path + '"')
         sprint('Dumping messages newer than {} using "{}" dumper.'.format(
             self.settings.last_message_id, self.settings.exporter))
# Example 3
def _load_exporter(exporter_name):
    """ Loads exporter from file <exporter_name>.py in ./exporters subfolder.
        :param exporter_name:      name of exporter. E.g. 'text' or 'json'

        :return: Exporter instance

        Terminates the process (SystemExit, exit code 1) when the exporter
        module or its class cannot be loaded.
    """
    # By convention exporters are located in ./exporters subfolder
    # COMMENT: Don't check file existence. It won't play well with pyinstaller bins
    exporter_file_name = exporter_name + ".py"
    exporter_rel_name = "telegram_messages_dump.exporters." + exporter_name
    # Load exporter from file
    sprint("Try to load exporter '%s'...  " % (exporter_file_name), end='')
    try:
        exporter_module = importlib.import_module(exporter_rel_name)
        sprint("OK!")
    except ModuleNotFoundError:
        sprint("\nERROR: Failed to load exporter './exporters/%s'." % exporter_file_name)
        # The `exit` builtin is injected by the `site` module and may be
        # absent in frozen (pyinstaller) builds; raising SystemExit directly
        # is the reliable equivalent.
        raise SystemExit(1)

    try:
        # By convention the module defines a class named after the exporter.
        exporter_class = getattr(exporter_module, exporter_name)
    except AttributeError:
        sprint("ERROR: Failed to load class '%s' out of './exporters/%s'." \
               % (exporter_name, exporter_file_name))
        raise SystemExit(1)

    return exporter_class()
    def run(self):
        """ Dumps all desired chat messages into a file """
        # Resolve the chat name into a numeric channel id first.
        resolved = self(ResolveUsernameRequest(self.settings.chat_name))

        if not resolved.chats:
            raise ValueError('Error: failed to resolve chat name into chat_id')

        chat = resolved.chats[0]

        sprint('Chat name @{} resolved into channel id={}'.format(
            self.settings.chat_name, chat.id))

        # Dump history to file
        count = self.dump_messages_in_file(chat)

        if self.settings.is_clean:
            try:
                # TODO self.log_out()
                sprint('Session data cleared.')
            # pylint: disable=broad-except
            except Exception:
                sprint('Error: failed to log-out.')

        sprint(
            '{} messages were successfully written in resulting file. Done!'.
            format(count))
    def init_connect(self):
        """ Connect to the Telegram server and Authenticate. """
        sprint('Connecting to Telegram servers...')
        # Give the connection a second chance before giving up.
        if not self.connect():
            sprint('Initial connection failed. Retrying...')
            if not self.connect():
                sprint('Could not connect to Telegram servers.')
                return

        # Then, ensure we're authorized and have access
        if self.is_user_authorized():
            return

        # First run: interactively request and verify a login code.
        sprint('First run. Sending code request...')
        self.send_code_request(self.settings.phone_num)

        self_user = None
        while self_user is None:
            code = input('Enter the code you just received: ')
            try:
                self_user = self.sign_in(self.settings.phone_num, code)
            # Two-step verification may be enabled
            except SessionPasswordNeededError:
                password = getpass("Two step verification is enabled. "
                                   "Please enter your password: ")
                self_user = self.sign_in(password=password)
    def retrieve_message_history(self, peer, buffer):
        """ Retrieves a chunk (up to 100) of messages from Telegram's DC and
            appends their formatted representations to 'buffer'.

            :param peer:        Chat/Channel object to fetch history from
            :param buffer:      buffer where formatted message strings are placed

            Side effects: decrements self.msg_count_to_process by the number of
            processed messages and advances self.id_offset to the id of the
            last message retrieved.

            :return: None
        """
        messages = []

        # First retrieve the messages and some information.
        # Make up to 5 attempts: a FloodWaitError forces a sleep + reconnect.
        for _ in range(0, 5):
            try:
                messages = self.get_message_history(peer,
                                                    limit=100,
                                                    offset_id=self.id_offset)

                # Check the container is non-empty before touching `.total`
                # so an empty/plain-list result never raises.
                if messages and messages.total > 0:
                    sprint('Processing messages with ids {}-{} ...'.format(
                        messages[0].id, messages[-1].id))
            except FloodWaitError as ex:
                sprint(
                    'FloodWaitError detected. Sleep for {} sec before reconnecting! \n'
                    .format(ex.seconds))
                sleep(ex.seconds)
                self.init_connect()
                continue
            break

        # Iterate over all (in reverse order so the latest appear
        # the last in the console) and print them with format provided by exporter.
        for msg in messages:
            # Mark the very first exported record for the exporter.
            self.exporter_context.is_first_record = \
                self.msg_count_to_process == 1
            msg_dump_str = self.exporter.format(msg, self.exporter_context)

            buffer.append(msg_dump_str)

            self.msg_count_to_process -= 1
            self.id_offset = msg.id
            self.exporter_context.is_last_record = False
            if self.msg_count_to_process == 0:
                break

        return
    def __init__(self, session_user_id, settings, exporter):
        """ Initialize the dumper client and connect to Telegram.

            :param session_user_id: name of the Telethon session to use
            :param settings:        settings object providing api_id, api_hash
                                    and other user-supplied options
            :param exporter:        exporter instance used to format messages
        """

        sprint('Initializing session...')
        super().__init__(session_user_id,
                         settings.api_id,
                         settings.api_hash,
                         connection_mode=ConnectionMode.TCP_FULL,
                         proxy=None,
                         update_workers=1)

        self.settings = settings
        self.exporter = exporter
        # Mutable state shared with the exporter (first/last record flags).
        self.exporter_context = ExporterContext()
        # Counter of messages still to fetch; set before dumping starts.
        self.msg_count_to_process = 0
        # Message id to continue fetching from (0 = start from the latest).
        self.id_offset = 0
        # Connect and authenticate immediately as part of construction.
        self.init_connect()
# Example 8
def _load_exporter(exporter_name):
    """ Loads exporter from file <exporter_name>.py in ./exporters subfolder.
        :param exporter_name:      name of exporter. E.g. 'text' or 'json'

        :return: Exporter instance
    """
    # By convention exporters live in the ./exporters subfolder.
    # COMMENT: Don't check file existence. It won't play well with pyinstaller bins.
    module_file = exporter_name + ".py"
    qualified_name = "telegram_messages_dump.exporters." + exporter_name
    # Load exporter from file
    sprint("Try to load exporter '%s'...  " % (module_file), end='')
    module = importlib.import_module(qualified_name)
    sprint("OK!")
    # By convention the module defines a class named after the exporter;
    # instantiate and return it.
    return getattr(module, exporter_name)()
# Example 9
    def send_to_repository(self, content):
        """ Commits `content` as content/sample-posts/<today>/index.md to the
            pvpshoot/urbantrip GitHub repository via the Contents API.

            :param content: file body (str) to upload
        """
        today = date.today().strftime("%Y-%m-%d")
        # Personal access token created by https://github.com/settings/tokens
        token = os.getenv('GITHUB_TOKEN')

        file_path = f"content/sample-posts/{today}/index.md"
        commit_msg = "upload posts"

        request_path = f'https://api.github.com/repos/pvpshoot/urbantrip/contents/{file_path}'
        # The Contents API expects the file body base64-encoded.
        file_content = base64.b64encode(
            content.encode("utf-8")).decode("utf-8")
        data = {"message": commit_msg, "content": file_content}
        # GitHub removed support for the `access_token` query parameter;
        # the token must be sent in the Authorization header instead.
        headers = {"Authorization": f"token {token}"}

        r = requests.put(request_path, json=data, headers=headers)

        # 201 Created is the success status for a new file commit.
        if r.status_code == 201:
            sprint(f"upload file: {file_path}")
        else:
            sprint(f"failed upload: {r.text}")
# Example 10
def main():
    """ Entry point. """
    settings = ChatDumpSettings(__doc__)

    # define the console output verbosity
    default_format = '%(levelname)s:%(message)s'
    log_level = logging.DEBUG if settings.is_verbose else logging.INFO
    logging.basicConfig(format=default_format, level=log_level)

    metadata = DumpMetadata(settings.out_file)

    # when user specified --continue
    try:
        if settings.is_incremental_mode and settings.last_message_id == -1:
            metadata.merge_into_settings(settings)
    except MetadataError as ex:
        sprint("ERROR: %s" % ex)
        sys.exit(1)

    exporter = _load_exporter(settings.exporter)

    dumper = TelegramDumper(
        os.path.basename(__file__), settings, metadata, exporter)
    sys.exit(dumper.run())
# Example 11
    def run(self):
        """ Dumps all desired chat messages into a file.

            :return: process exit code - 0 on success, 1 on any failure
                     (the caller passes this to sys.exit()).
        """

        ret_code = 0
        try:
            self._init_connect()
            try:
                chatObj = self._getChannel()
            except ValueError as ex:
                # Chat name could not be resolved - report and bail out.
                ret_code = 1
                self.logger.error('%s',
                                  ex,
                                  exc_info=self.logger.level > logging.INFO)
                # BUGFIX: a bare `return` here returned None, so the process
                # exited with code 0 even though resolution failed.
                return ret_code
            # Fetch history in chunks and save it into a resulting file
            self._do_dump(chatObj)
        except (DumpingError, MetadataError) as ex:
            self.logger.error('%s',
                              ex,
                              exc_info=self.logger.level > logging.INFO)
            ret_code = 1
        except KeyboardInterrupt:
            sprint("Received a user's request to interrupt, stopping…")
            ret_code = 1
        except Exception as ex:  # pylint: disable=broad-except
            self.logger.error('Uncaught exception occured. %s',
                              ex,
                              exc_info=self.logger.level > logging.INFO)
            ret_code = 1
        finally:
            self.logger.debug(
                'Make sure there are no temp files left undeleted.')
            # Clear temp files if any
            while self.temp_files_list:
                try:
                    os.remove(self.temp_files_list.pop().name)
                except Exception:  # pylint: disable=broad-except
                    # Best-effort cleanup; a leftover temp file is harmless.
                    pass

        if self.settings.is_clean:
            try:
                # TODO
                # self.log_out()
                self.logger.info('Session data cleared.')
            except Exception:  # pylint: disable=broad-except
                sprint('Failed to logout and clean session data.')

        sprint(
            '{} messages were successfully written in the resulting file. Done!'
            .format(self.output_total_count))
        # Publish the freshly written dump to the configured repository.
        with open(self.settings.out_file, 'r') as content:
            self.send_to_repository(content.read())
        return ret_code
    def _do_dump(self, peer):
        """ Retrieves messages in small chunks (Default: 100) and saves them in in-memory 'buffer'.
            When buffer reaches BUFFER_SIZE messages they are saved into intermediate temp file.
            In the end messages from all the temp files are being moved into resulting file in
            ascending order along with the remaining ones in 'buffer'.
            After all, temp files are deleted.

             :param peer: Chat/Channel object that contains the message history of interest

             :raises DumpingError: when writing the final output file fails.
        """
        # A limit of -1 or 0, or incremental mode, means "no explicit limit":
        # process a (virtually) unbounded number of messages.
        if self.settings.is_incremental_mode or self.settings.limit in (-1, 0):
            self.msg_count_to_process = sys.maxsize
        else:
            self.msg_count_to_process = self.settings.limit

        self._check_preconditions()

        # Current buffer of messages, that will be batched into a temp file
        # or otherwise written directly into the resulting file if there are too few of them
        # to form a batch of size BUFFER_SIZE.
        buffer = deque()

        # A stale metafile only makes sense for incremental (--continue) runs;
        # for a full dump delete it, as it describes the previous output file.
        if not self.settings.is_incremental_mode:
            self.metadata.delete_meta_file()

        temp_files_list_meta = deque()  # a list of meta info about batches

        # process messages until either all message count requested by user are retrieved
        # or offset_id reaches msg_id=1 - the head of a channel message history
        try:
            while self.msg_count_to_process > 0:
                # sleep for a few seconds to avoid flood ban
                sleep(2)

                latest_message_id_fetched = self._fetch_messages_from_server(
                    peer, buffer)

                # This is for the case when buffer with fewer than BUFFER_SIZE records
                # Relies on the fact that `_fetch_messages_from_server` returns messages
                # in reverse order
                if self.cur_latest_message_id < latest_message_id_fetched:
                    self.cur_latest_message_id = latest_message_id_fetched
                # when buffer is full, flush it into a temp file
                # Assume that once a message got into temp file it will be counted as successful
                # 'output_total_count'. This has to be improved.
                if len(buffer) >= self.BUFFER_SIZE:
                    self._flush_buffer_in_temp_file(buffer)
                    temp_files_list_meta.append(latest_message_id_fetched)

                # break if the very beginning of channel history is reached
                if latest_message_id_fetched == -1 or self.id_offset <= 1:
                    break
        except RuntimeError as ex:
            sprint('Fetching messages from server failed. ' + str(ex))
            sprint(
                'Warn: The resulting file will contain partial/incomplete data.'
            )

        # Write all chunks into resulting file
        sprint('Merging results into an output file.')
        try:
            self._write_final_file(buffer, temp_files_list_meta)
        except OSError as ex:
            raise DumpingError("Dumping to a final file failed.") from ex

        # Metadata that will be written into a metafile
        meta_dict = {
            "latest_message_id": self.cur_latest_message_id,
            "exporter_name": self.settings.exporter,
            "chat_name": self.settings.chat_name
        }
        self.metadata.save_meta_file(meta_dict)
    def _get_channel(self):
        """ Returns telethon.tl.types.Channel object resolved from chat_name
            at Telegram server.

            Resolution order:
              1. invite link (private channels; login user must have joined);
              2. @username via ResolveUsernameRequest;
              3. the logged-in user's dialogs, by title or by username.

            :raises ValueError: when the name cannot be resolved.
        """
        name = self.settings.chat_name

        # For private channels try to resolve channel peer object from its invitation link
        # Note: it will only work if the login user has already joined the private channel.
        # Otherwise, get_entity will throw ValueError
        if name.startswith(JOIN_CHAT_PREFIX_URL):
            self.logger.debug('Trying to resolve as invite url.')
            try:
                peer = self.get_entity(name)
                if peer:
                    sprint('Invitation link "{}" resolved into channel id={}'.
                           format(name, peer.id))
                    return peer
            except ValueError as ex:
                self.logger.debug(
                    'Failed to resolve "%s" as an invitation link. %s',
                    self.settings.chat_name,
                    ex,
                    exc_info=self.logger.level > logging.INFO)

        if name.startswith('@'):
            # Strip the leading '@'; `name` never starts with '@' below here.
            name = name[1:]
            self.logger.debug('Trying ResolveUsernameRequest().')
            try:
                peer = self(ResolveUsernameRequest(name))
                if peer.chats is not None and peer.chats:
                    sprint('Chat name "{}" resolved into channel id={}'.format(
                        name, peer.chats[0].id))
                    return peer.chats[0]
                if peer.users is not None and peer.users:
                    sprint('User name "{}" resolved into channel id={}'.format(
                        name, peer.users[0].id))
                    return peer.users[0]
            except (UsernameNotOccupiedError, UsernameInvalidError) as ex:
                self.logger.debug('Failed to resolve "%s" as @-chat-name. %s',
                                  self.settings.chat_name,
                                  ex,
                                  exc_info=self.logger.level > logging.INFO)

        # Search in dialogs first, this way we will find private groups and
        # channels.
        self.logger.debug('Fetch logged in user`s dialogs')
        dialogs_count = self.get_dialogs(0).total
        self.logger.info('%s user`s dialogs found', dialogs_count)
        dialogs = self.get_dialogs(limit=None)
        self.logger.debug('%s dialogs fetched.', len(dialogs))
        for dialog in dialogs:
            if dialog.name == name:
                sprint('Dialog title "{}" resolved into channel id={}'.format(
                    name, dialog.entity.id))
                return dialog.entity
            if hasattr(dialog.entity,
                       'username') and dialog.entity.username == name:
                sprint(
                    'Dialog username "{}" resolved into channel id={}'.format(
                        name, dialog.entity.id))
                return dialog.entity
            # NOTE: a third check for name.startswith('@') was removed as dead
            # code: any leading '@' is already stripped from `name` above, so
            # that condition could never be True at this point.
        self.logger.debug('Specified chat name was not found among dialogs.')

        raise ValueError(
            'Failed to resolve dialogue/chat name "{}".'.format(name))
    def dump_messages_in_file(self, peer):
        """ Retrieves messages in small chunks (Default: 100) and saves them in in-memory 'buffer'.
            When buffer reaches '1000' messages they are saved into intermediate temp file.
            In the end messages from all the temp files are being moved into resulting file in
            ascending order along with the remaining ones in 'buffer'.
            After all, temp files are deleted.

             :param peer: Chat/Channel object that contains the message history of interest

             :return  Total number of messages that were written into the resulting file
        """
        # limit <= 0 means "no limit": process virtually unlimited messages.
        history_length = self.settings.limit if self.settings.limit > 0 else sys.maxsize
        file_path = self.settings.out_file

        sprint('Dumping {} messages into "{}" file ...'.format(
            'all' if history_length == sys.maxsize else history_length,
            file_path))

        self.msg_count_to_process = history_length
        # 0 = start fetching from the newest message.
        self.id_offset = 0
        output_total_count = 0

        # buffer to save a bulk of messages before flushing them to a file
        buffer = deque()
        temp_files_list = []

        # process messages until either all message count requested by user are retrieved
        # or offset_id reaches msg_id=1 - the head of a channel message history
        while self.msg_count_to_process > 0:
            sleep(2)  # slip for a few seconds to avoid flood ban
            # Appends formatted messages to `buffer` and advances
            # self.id_offset / self.msg_count_to_process as side effects.
            self.retrieve_message_history(peer, buffer)
            # when buffer is full, flush it into a temp file
            if len(buffer) >= 1000:
                with tempfile.NamedTemporaryFile(mode='w+',
                                                 encoding='utf-8',
                                                 delete=False) as tf:
                    tf.write(codecs.BOM_UTF8.decode())
                    # pop() drains the deque from the right (LIFO), which
                    # restores ascending message order within this batch.
                    while buffer:
                        output_total_count += 1
                        print(buffer.pop(), file=tf)
                    # Keep the handle so the file can be merged and deleted later.
                    temp_files_list.append(tf)

            # break if the very beginning of channel history is reached
            if self.id_offset <= 1:
                break

        # Write all chunks into resulting file
        with codecs.open(file_path, 'w', 'utf-8') as resulting_file:
            resulting_file.write(codecs.BOM_UTF8.decode())

            self.exporter.begin_final_file(resulting_file)

            # flush what's left in the mem buffer into resulting file
            while buffer:
                output_total_count += 1
                print(buffer.pop(), file=resulting_file)

            # merge all temp files into final one and delete them
            # (reversed: the last-written batch holds the oldest messages)
            for tf in reversed(temp_files_list):
                with codecs.open(tf.name, 'r', 'utf-8') as ctf:
                    for line in ctf.readlines():
                        print(line, file=resulting_file, end='')
                # delete temp file
                tf.close()
                os.remove(tf.name)

            self.exporter.end_final_file(resulting_file)

        return output_total_count