def calculate_download_size(self, dl_propics, dl_photos, dl_docs,
                            docs_max_size=None, before_date=None,
                            after_date=None):
    """Estimates the download size, given some parameters"""
    with TLDatabase(self.backup_dir) as db:
        size = 0

        # TODO How does Telegram Desktop find out the profile photo size?
        if dl_propics:
            # No exact size is stored for profile pictures, so we can only
            # multiply the user count by an average estimate
            size += AVERAGE_PROPIC_SIZE * db.count(
                'users where photo not null')

        if dl_photos:
            # The largest (last) size of each photo is the one downloaded
            photo_query = self.get_query(
                MessageMediaPhoto, before_date, after_date)
            size += sum(m.media.photo.sizes[-1].size
                        for m in db.query_messages(photo_query))

        if dl_docs:
            doc_query = self.get_query(
                MessageMediaDocument, before_date, after_date)
            for m in db.query_messages(doc_query):
                doc_size = m.media.document.size
                # Only count documents within the configured size limit
                if not docs_max_size or doc_size <= docs_max_size:
                    size += doc_size

        return size
def __init__(self, client, entity,
             download_delay=1, download_chunk_size=100):
    """Sets up a backuper for the given entity, creating its backup
       directory, database and on-disk entity/metadata files.

    :param client: An initialized TelegramClient, which will be used to download the messages
    :param entity: The entity (user, chat or channel) from which the backup will be made
    :param download_delay: The download delay, in seconds, after a message chunk is downloaded
    :param download_chunk_size: The chunk size (i.e. how many messages do we download every time)
                                The maximum allowed by Telegram is 100
    """
    self.client = client
    self.entity = entity
    self.download_delay = download_delay
    self.download_chunk_size = download_chunk_size

    # Each entity gets its own backup folder, keyed by its unique ID
    self.backup_dir = path.join(Backuper.backups_dir, str(entity.id))
    self.media_handler = MediaHandler(self.backup_dir)

    # Open and close the database to create the required directories
    TLDatabase(self.backup_dir).close()

    # Set up all the directories and files that we'll be needing
    self.files = {
        'entity': path.join(self.backup_dir, 'entity.tlo'),
        'metadata': path.join(self.backup_dir, 'metadata.json')
    }

    # TODO Crashes if the other user got us blocked (AttributeError: 'NoneType' object has no attribute 'photo_big')

    # Is the backup running (are messages being downloaded?)
    self.backup_running = False

    # Event that gets fired when metadata is saved; callers may assign a
    # callable here to be notified of changes
    self.on_metadata_change = None

    # Save the entity serialized in TL format, and load the metadata
    with open(self.files['entity'], 'wb') as file:
        with BinaryWriter(file) as writer:
            entity.on_send(writer)
    self.metadata = self.load_metadata()
def backup_media_thread(self, dl_propics, dl_photos, dl_docs,
                        docs_max_size=None, before_date=None,
                        after_date=None, progress_callback=None):
    """Backups the specified media contained in the given database file.

    :param dl_propics: Should profile pictures be downloaded?
    :param dl_photos: Should photos be downloaded?
    :param dl_docs: Should documents be downloaded?
    :param docs_max_size: Skip documents larger than this many bytes (None = no limit)
    :param before_date: Only consider media sent before this date
    :param after_date: Only consider media sent after this date
    :param progress_callback: Called as (current, total, etl) after every item
    """
    self.backup_running = True

    # Create a connection to the database
    db = TLDatabase(self.backup_dir)
    try:
        # Store how many bytes we have/how many bytes there are in total.
        # Pass the dates as keywords: the callee declares before_date first,
        # so passing them positionally in the other order swaps the filters.
        current = 0
        total = self.calculate_download_size(
            dl_propics, dl_photos, dl_docs,
            docs_max_size=docs_max_size,
            before_date=before_date, after_date=after_date)

        # Keep track from when we started to determine the estimated time left
        start = datetime.now()

        if dl_propics:
            # TODO Also query chats and channels
            for user in db.query_users('where photo not null'):
                if not self.backup_running:
                    return
                # Try downloading the photo
                output = self.media_handler.get_propic_path(user)
                try:
                    if not self.valid_file_exists(output):
                        self.client.download_profile_photo(
                            user.photo, add_extension=False,
                            file_path=output)
                        sleep(self.download_delay)
                except RPCError as e:
                    print('Error downloading profile photo:', e)
                finally:
                    # The real size is unknown; count the average estimate
                    current += AVERAGE_PROPIC_SIZE
                    if progress_callback:
                        progress_callback(
                            current, total,
                            self.calculate_etl(current, total, start))

        if dl_photos:
            for msg in db.query_messages(
                    self.get_query(MessageMediaPhoto,
                                   before_date, after_date)):
                if not self.backup_running:
                    return
                # Try downloading the photo
                output = self.media_handler.get_msg_media_path(msg)
                try:
                    if not self.valid_file_exists(output):
                        self.client.download_msg_media(
                            msg.media, add_extension=False,
                            file_path=output)
                        sleep(self.download_delay)
                except RPCError as e:
                    print('Error downloading photo:', e)
                finally:
                    current += msg.media.photo.sizes[-1].size
                    if progress_callback:
                        progress_callback(
                            current, total,
                            self.calculate_etl(current, total, start))

        # TODO Add an internal callback to determine how the current document
        # download is going, and update our currently saved bytes count
        # based on that
        if dl_docs:
            for msg in db.query_messages(
                    self.get_query(MessageMediaDocument,
                                   before_date, after_date)):
                if not self.backup_running:
                    return
                if not docs_max_size or \
                        msg.media.document.size <= docs_max_size:
                    # Try downloading the document
                    output = self.media_handler.get_msg_media_path(msg)
                    try:
                        if not self.valid_file_exists(output):
                            self.client.download_msg_media(
                                msg.media, add_extension=False,
                                file_path=output)
                            sleep(self.download_delay)
                    except RPCError as e:
                        print('Error downloading document:', e)
                    finally:
                        current += msg.media.document.size
                        if progress_callback:
                            progress_callback(
                                current, total,
                                self.calculate_etl(current, total, start))
    finally:
        # Always close the database, even on early return or error
        db.close()
def backup_messages_thread(self):
    """Backups the messages (should be run in a different thread).

    Downloads message history in chunks via GetHistoryRequest, resuming
    from self.metadata['resume_msg_id'], and persists messages, users and
    chats into the backup database, committing after every chunk.
    """
    self.backup_running = True

    # Create a connection to the database
    db = TLDatabase(self.backup_dir)

    # Determine whether we started making the backup from the very first message or not.
    # If this is the case:
    #   We won't need to come back to the first message again after we've finished downloading
    #   them all, since that first message will already be in backup.
    #
    # Otherwise, if we did not start from the first message:
    #   More messages were in the backup already, and after we backup those "left" ones,
    #   we must return to the first message and backup until where we started.
    started_at_0 = self.metadata['resume_msg_id'] == 0

    # Keep an internal downloaded count for it to be faster
    # (instead of querying the database all the time)
    self.metadata['saved_msgs'] = db.count('messages')

    # We also need to keep track of how many messages we've downloaded now
    # in order to calculate the estimated time left properly
    saved_msgs_now = 0

    # Make the backup
    try:
        # We need this to invoke GetHistoryRequest
        input_peer = get_input_peer(self.entity)

        # Keep track from when we started to determine the estimated time left
        start = datetime.now()

        # Enter the download-messages main loop
        while self.backup_running:
            # Invoke the GetHistoryRequest to get the next messages after those we have
            result = self.client.invoke(GetHistoryRequest(
                peer=input_peer,
                offset_id=self.metadata['resume_msg_id'],
                limit=self.download_chunk_size,
                offset_date=None,
                add_offset=0,
                max_id=0,
                min_id=0
            ))
            # Slices expose .count (total in the dialog); a full Messages
            # result does not, so fall back to the chunk length
            self.metadata['total_msgs'] = getattr(
                result, 'count', len(result.messages))

            # First add users and chats, replacing any previous value
            for user in result.users:
                db.add_object(user, replace=True)
            for chat in result.chats:
                db.add_object(chat, replace=True)

            # Then add the messages to the backup
            for msg in result.messages:
                if db.in_table(msg.id, 'messages'):
                    # If the message we retrieved was already saved, this means that we're
                    # done because we have the rest of the messages.
                    # Clear the list so we enter the next if, and break to early terminate
                    self.metadata['resume_msg_id'] = result.messages[-1].id
                    del result.messages[:]
                    break
                else:
                    db.add_object(msg)
                    saved_msgs_now += 1
                    self.metadata['saved_msgs'] += 1
                    self.metadata['resume_msg_id'] = msg.id

            self.metadata['etl'] = str(self.calculate_etl(
                saved_msgs_now, self.metadata['total_msgs'],
                start=start))

            # Always commit at the end to save changes
            db.commit()
            self.save_metadata()

            # The list can be empty because we've either used a too big offset
            # (in which case we have all the previous messages), or we've reached
            # a point where we have the upcoming messages (so there's no need to
            # download them again and we stopped)
            if not result.messages:
                # We've downloaded all the messages since the last backup
                if started_at_0:
                    # And since we started from the very first message, we have them all
                    print('Downloaded all {}'.format(
                        self.metadata['total_msgs']))
                    break
                else:
                    # We need to start from the first message (latest sent message)
                    # and backup again until we have them all
                    self.metadata['resume_msg_id'] = 0
                    started_at_0 = True

            # Always sleep a bit, or Telegram will get angry and tell us to chill
            sleep(self.download_delay)

        pass  # end while

    except KeyboardInterrupt:
        print('Operation cancelled, not downloading more messages!')

        # Also commit here, we don't want to lose any information!
        db.commit()
        self.save_metadata()

    finally:
        self.backup_running = False
def backup_media_thread(self, dl_propics, dl_photos, dl_docs,
                        docs_max_size=None, before_date=None,
                        after_date=None, progress_callback=None):
    """Backups the specified media contained in the given database file.

    :param dl_propics: Should profile pictures be downloaded?
    :param dl_photos: Should photos be downloaded?
    :param dl_docs: Should documents be downloaded?
    :param docs_max_size: Skip documents larger than this many bytes (None = no limit)
    :param before_date: Only consider media sent before this date
    :param after_date: Only consider media sent after this date
    :param progress_callback: Called as (current, total, etl) after every item
    """
    self.backup_running = True

    # Create a connection to the database
    db = TLDatabase(self.backup_dir)
    try:
        # Store how many bytes we have/how many bytes there are in total.
        # Use keyword arguments for the dates: the callee declares
        # before_date first, so positional (after, before) swaps the filters.
        current = 0
        total = self.calculate_download_size(
            dl_propics, dl_photos, dl_docs,
            docs_max_size=docs_max_size,
            before_date=before_date, after_date=after_date)

        # Keep track from when we started to determine the estimated time left
        start = datetime.now()

        if dl_propics:
            # TODO Also query chats and channels
            for user in db.query_users('where photo not null'):
                if not self.backup_running:
                    return
                # Try downloading the photo
                output = self.media_handler.get_propic_path(user)
                try:
                    if not self.valid_file_exists(output):
                        self.client.download_profile_photo(
                            user.photo, add_extension=False,
                            file_path=output)
                        sleep(self.download_delay)
                except RPCError as e:
                    print('Error downloading profile photo:', e)
                finally:
                    # The real size is unknown; count the average estimate
                    current += AVERAGE_PROPIC_SIZE
                    if progress_callback:
                        progress_callback(
                            current, total,
                            self.calculate_etl(current, total, start))

        if dl_photos:
            for msg in db.query_messages(
                    self.get_query(MessageMediaPhoto,
                                   before_date, after_date)):
                if not self.backup_running:
                    return
                # Try downloading the photo
                output = self.media_handler.get_msg_media_path(msg)
                try:
                    if not self.valid_file_exists(output):
                        self.client.download_msg_media(
                            msg.media, add_extension=False,
                            file_path=output)
                        sleep(self.download_delay)
                except RPCError as e:
                    print('Error downloading photo:', e)
                finally:
                    current += msg.media.photo.sizes[-1].size
                    if progress_callback:
                        progress_callback(
                            current, total,
                            self.calculate_etl(current, total, start))

        # TODO Add an internal callback to determine how the current document
        # download is going, and update our currently saved bytes count
        # based on that
        if dl_docs:
            for msg in db.query_messages(
                    self.get_query(MessageMediaDocument,
                                   before_date, after_date)):
                if not self.backup_running:
                    return
                if not docs_max_size or \
                        msg.media.document.size <= docs_max_size:
                    # Try downloading the document
                    output = self.media_handler.get_msg_media_path(msg)
                    try:
                        if not self.valid_file_exists(output):
                            self.client.download_msg_media(
                                msg.media, add_extension=False,
                                file_path=output)
                            sleep(self.download_delay)
                    except RPCError as e:
                        print('Error downloading document:', e)
                    finally:
                        current += msg.media.document.size
                        if progress_callback:
                            progress_callback(
                                current, total,
                                self.calculate_etl(current, total, start))
    finally:
        # Always close the database, even on early return or error
        db.close()
def backup_messages_thread(self):
    """Backups the messages (should be run in a different thread).

    Connects the client, then downloads message history in chunks via
    GetHistoryRequest, resuming from self.metadata['resume_msg_id'], and
    persists messages, users and chats into the backup database,
    committing after every chunk.
    """
    self.backup_running = True

    # Create a connection to the database
    db = TLDatabase(self.backup_dir)

    # Determine whether we started making the backup from the very first message or not.
    # If this is the case:
    #   We won't need to come back to the first message again after we've finished downloading
    #   them all, since that first message will already be in backup.
    #
    # Otherwise, if we did not start from the first message:
    #   More messages were in the backup already, and after we backup those "left" ones,
    #   we must return to the first message and backup until where we started.
    started_at_0 = self.metadata['resume_msg_id'] == 0

    # Keep an internal downloaded count for it to be faster
    # (instead of querying the database all the time)
    self.metadata['saved_msgs'] = db.count('messages')

    # We also need to keep track of how many messages we've downloaded now
    # in order to calculate the estimated time left properly
    saved_msgs_now = 0

    # Make the backup
    try:
        # We need this to invoke GetHistoryRequest
        input_peer = self.entity

        # Keep track from when we started to determine the estimated time left
        start = datetime.now()

        # Enter the download-messages main loop
        self.client.connect()
        while self.backup_running:
            # Invoke the GetHistoryRequest to get the next messages after those we have
            result = self.client.invoke(
                GetHistoryRequest(peer=input_peer,
                                  offset_id=self.metadata['resume_msg_id'],
                                  limit=self.download_chunk_size,
                                  offset_date=None,
                                  add_offset=0,
                                  max_id=0,
                                  min_id=0))

            # For some strange reason, GetHistoryRequest might return upload.file.File
            # Ensure we retrieved Messages or MessagesSlice
            if not isinstance(result, Messages) and not isinstance(result, MessagesSlice) \
                    and not isinstance(result, ChannelMessages):
                print('Invalid result type when downloading messages:', type(result))
                sleep(self.download_delay)
                continue

            # Slices expose .count (total in the dialog); a full Messages
            # result does not, so fall back to the chunk length
            self.metadata['total_msgs'] = getattr(
                result, 'count', len(result.messages))

            # First add users and chats, replacing any previous value
            for user in result.users:
                db.add_object(user, replace=True)
            for chat in result.chats:
                db.add_object(chat, replace=True)

            # Then add the messages to the backup
            for msg in result.messages:
                if db.in_table(msg.id, 'messages'):
                    # If the message we retrieved was already saved, this means that we're
                    # done because we have the rest of the messages.
                    # Clear the list so we enter the next if, and break to early terminate
                    self.metadata['resume_msg_id'] = result.messages[-1].id
                    del result.messages[:]
                    break
                else:
                    db.add_object(msg)
                    saved_msgs_now += 1
                    self.metadata['saved_msgs'] += 1
                    self.metadata['resume_msg_id'] = msg.id

            self.metadata['etl'] = str(
                self.calculate_etl(saved_msgs_now,
                                   self.metadata['total_msgs'],
                                   start=start))

            # Always commit at the end to save changes
            db.commit()
            self.save_metadata()

            # The list can be empty because we've either used a too big offset
            # (in which case we have all the previous messages), or we've reached
            # a point where we have the upcoming messages (so there's no need to
            # download them again and we stopped)
            if not result.messages:
                # We've downloaded all the messages since the last backup
                if started_at_0:
                    # And since we started from the very first message, we have them all
                    print('Downloaded all {}'.format(
                        self.metadata['total_msgs']))
                    break
                else:
                    # We need to start from the first message (latest sent message)
                    # and backup again until we have them all
                    self.metadata['resume_msg_id'] = 0
                    started_at_0 = True

            # Always sleep a bit, or Telegram will get angry and tell us to chill
            sleep(self.download_delay)

        pass  # end while

    except KeyboardInterrupt:
        print('Operation cancelled, not downloading more messages!')

        # Also commit here, we don't want to lose any information!
        db.commit()
        self.save_metadata()

    finally:
        self.backup_running = False
def export_thread(self, callback):
    """Exports a conversation to HTML (should be run in a different thread).

    Walks the backed-up messages in chronological order, writing one HTML
    file per day via HTMLTLWriter and copying the referenced media files
    next to the output.

    :param callback: Optional callable invoked with a progress dict
                     ({'exported', 'total', 'etl'}) as the export advances
    """
    # NOTE(review): this method uses self.backups_dir (plural), unlike the
    # backup methods' self.backup_dir — presumably a different class;
    # verify the attribute actually exists on this instance.
    with TLDatabase(self.backups_dir) as db:
        db_media_handler = MediaHandler(self.backups_dir)

        # First copy the default media files
        self.copy_default_media()

        progress = {
            'exported': 0,
            'total': db.count('messages'),
            'etl': 'Unknown'
        }

        # The first date will obviously be the first day
        # TODO This fails if there are 0 messages in the database, export should be disabled!
        previous_date = self.get_message_date(
            db.query_message('order by id asc'))

        # Also find the next day
        following_date = self.get_previous_and_next_day(db, previous_date)[1]

        # Set the first writer (which will have the "previous" date, the first one)
        writer = HTMLTLWriter(previous_date, self.media_handler,
                              following_date=following_date)

        # Keep track from when we started to determine the estimated time left
        start = datetime.now()

        # Export the profile photos, from users chats and channels
        # TODO This should also have a progress if we have a backup of thousands of files!
        for user in db.query_users():
            if user.photo:
                source = db_media_handler.get_propic_path(user)
                output = self.media_handler.get_propic_path(user)
                # Only copy profile pictures that were actually downloaded
                if isfile(source):
                    copyfile(source, output)

        # Iterate over all the messages to export them in their respective days
        for msg in db.query_messages('order by id asc'):
            msg_date = self.get_message_date(msg)
            progress['exported'] += 1

            # As soon as we're in the next day, update the output the writer
            if msg_date != previous_date:
                # Exit the previous writer to end the header
                writer.__exit__(None, None, None)

                # Update date values and create a new instance
                previous_date, following_date = \
                    self.get_previous_and_next_day(db, msg_date)
                writer = HTMLTLWriter(msg_date, self.media_handler,
                                      previous_date=previous_date,
                                      following_date=following_date)
                # Call the callback (once per day boundary, not per message)
                if callback:
                    progress['etl'] = self.calculate_etl(
                        start, progress['exported'], progress['total'])
                    callback(progress)
                else:
                    print(progress)

            writer.write_message(msg, db)

            # If the message has media, we need to copy it so it's accessible by the exported HTML
            if not isinstance(msg, MessageService) and msg.media:
                source = db_media_handler.get_msg_media_path(msg)
                output = self.media_handler.get_msg_media_path(msg)
                # Source may be None if the media is unsupported (i.e. a webpage)
                if source and isfile(source):
                    copyfile(source, output)

            previous_date = msg_date

        # Always exit at the end to close the final day's file
        writer.__exit__(None, None, None)

        # Call the callback to notify we've finished
        if callback:
            progress['etl'] = timedelta(seconds=0)
            callback(progress)