Esempio n. 1
0
def get_file_location(media):
    """
    Resolve arbitrary media into an ``(input location, size)`` pair.

    A ``MessageMediaPhoto`` is unwrapped to its inner photo first. Then:

    * ``Photo``: the sizes are scanned from last to first and the first
      ``PhotoSize`` holding a ``FileLocation`` provides both the location
      and the size.
    * ``MessageMediaDocument`` wrapping a ``Document``: an
      ``InputDocumentFileLocation`` is built and the document size is used.
    * ``UserProfilePhoto`` / ``ChatPhoto``: the big photo is preferred over
      the small one; no size is known for these.

    A plain ``FileLocation`` is converted into an ``InputFileLocation``
    before returning. Either element of the returned tuple is ``None``
    when it could not be determined.
    """
    if isinstance(media, types.MessageMediaPhoto):
        media = media.photo

    found = None
    size_bytes = None

    if isinstance(media, types.Photo):
        # Lazily pick the first usable size, walking the list backwards.
        usable = (
            candidate for candidate in reversed(media.sizes)
            if isinstance(candidate, types.PhotoSize)
            and isinstance(candidate.location, types.FileLocation)
        )
        chosen = next(usable, None)
        if chosen is not None:
            size_bytes = chosen.size
            found = chosen.location
    elif isinstance(media, types.MessageMediaDocument):
        document = media.document
        if isinstance(document, types.Document):
            size_bytes = document.size
            found = types.InputDocumentFileLocation(
                id=document.id,
                access_hash=document.access_hash,
                version=document.version)
    elif isinstance(media, (types.UserProfilePhoto, types.ChatPhoto)):
        # The small variant is only looked at when the big one is unusable.
        for attribute in ('photo_big', 'photo_small'):
            candidate = getattr(media, attribute)
            if isinstance(candidate, types.FileLocation):
                found = candidate
                break

    if not isinstance(found, types.FileLocation):
        return found, size_bytes

    converted = types.InputFileLocation(volume_id=found.volume_id,
                                        local_id=found.local_id,
                                        secret=found.secret)
    return converted, size_bytes
Esempio n. 2
0
    def download_profile_photo(self, photo, target, known_id=None):
        """
        Similar to Downloader.download_media() but for profile photos.

        Has no effect if there is no photo format (thus it is "disabled").

        ``photo`` may be a ``UserProfilePhoto`` / ``ChatPhoto`` (the big
        variant is preferred over the small one) or a ``Photo`` (the first
        ``PhotoSize`` holding a ``FileLocation`` is used). Any other value,
        or a photo without a usable location, is ignored.

        ``target`` is the entity the photo belongs to; its peer ID and
        display name fill the format fields. ``known_id`` overrides the
        ``id`` field; when omitted it is the photo ID for a ``Photo`` and
        the peer ID of ``target`` otherwise.

        Returns the result of ``self.client.download_file()``, or ``None``
        when nothing was downloaded.
        """
        if not self.photo_fmt:
            return

        date = datetime.datetime.now()
        if isinstance(photo, (types.UserProfilePhoto, types.ChatPhoto)):
            if isinstance(photo.photo_big, types.FileLocation):
                location = photo.photo_big
            elif isinstance(photo.photo_small, types.FileLocation):
                location = photo.photo_small
            else:
                return
        elif isinstance(photo, types.Photo):
            for size in photo.sizes:
                if isinstance(size, types.PhotoSize):
                    if isinstance(size.location, types.FileLocation):
                        location = size.location
                        break
            else:
                # No size had a usable location
                return
            date = photo.date
            if known_id is None:
                known_id = photo.id
        else:
            return

        peer_id = utils.get_peer_id(target)
        display_name = utils.get_display_name(target) or 'unknown'
        if known_id is None:
            known_id = peer_id

        formatter = defaultdict(
            str,
            id=known_id,
            context_id=peer_id,
            sender_id=peer_id,
            ext='.jpg',
            type='chatphoto',
            filename=date.strftime('chatphoto_%Y-%m-%d_%H-%M-%S'),
            name=display_name,
            sender_name=display_name)

        # Apply the date first, then fill in the remaining named fields
        filename = date.strftime(self.photo_fmt).format_map(formatter)
        if not filename.endswith(formatter['ext']):
            if filename.endswith('.'):
                filename = filename[:-1]
            filename += formatter['ext']

        # A format without a directory component gives '' here, and
        # os.makedirs('') raises FileNotFoundError, so only create the
        # directory when there actually is one.
        directory = os.path.dirname(filename)
        if directory:
            os.makedirs(directory, exist_ok=True)

        return self.client.download_file(types.InputFileLocation(
            volume_id=location.volume_id,
            local_id=location.local_id,
            secret=location.secret),
                                         file=filename,
                                         part_size_kb=256)
Esempio n. 3
0
    async def _download_media(self, media_id, context_id, sender_id, date,
                              bar):
        """
        Download the media saved under ``media_id`` in the ``Media`` table.

        Only ``photo`` and ``document`` media is downloaded; any other type
        is ignored, and so is media whose target file already exists.

        The target path is ``self.media_fmt`` with ``date`` applied through
        ``strftime`` and then formatted with the context/sender fields,
        followed by ``.<media_id><ext>``. The extension comes from the
        stored file name when it has one, otherwise from the MIME type.

        ``bar`` is the progress bar to advance (its ``total`` attribute and
        ``update()`` method are used). Its total grows by the stored size
        before the download starts, when that size is known.

        While downloading, the target path is kept in
        ``self._incomplete_download``; it is reset to ``None`` only when
        the download finishes without raising.
        """
        media_row = self.dumper.conn.execute(
            'SELECT LocalID, VolumeID, Secret, Type, MimeType, Name, Size '
            'FROM Media WHERE ID = ?', (media_id, )).fetchone()
        # Documents have attributes and they're saved under the "document"
        # namespace so we need to split it before actually comparing.
        media_type = media_row[3].split('.')
        media_type, media_subtype = media_type[0], media_type[-1]
        if media_type not in ('photo', 'document'):
            return  # Only photos or documents are actually downloadable

        formatter = defaultdict(str,
                                context_id=context_id,
                                sender_id=sender_id,
                                type=media_subtype or 'unknown',
                                name=self._get_name(context_id) or 'unknown',
                                sender_name=self._get_name(sender_id)
                                or 'unknown')

        # Documents might have a filename, which may have an extension. Use
        # the extension from the filename if any (more accurate than mime).
        ext = None
        filename = media_row[5]
        if filename:
            filename, ext = os.path.splitext(filename)
        else:
            # No filename at all, set a sensible default filename
            filename = date.strftime('{}_%Y-%m-%d_%H-%M-%S'.format(
                formatter['type']))

        # Either there was no filename or it had no extension.
        # Detect a sensible extension from the known mimetype.
        if not ext:
            ext = export_utils.get_extension(media_row[4])

        # Apply the date to the user format string and then replace the map
        formatter['filename'] = filename
        filename = date.strftime(self.media_fmt).format_map(formatter)
        filename += '.{}{}'.format(media_id, ext)
        if os.path.isfile(filename):
            __log__.debug('Skipping already-existing file %s', filename)
            return

        __log__.debug('Downloading to %s', filename)
        # A format without a directory component gives '' here, and
        # os.makedirs('') raises FileNotFoundError, so only create the
        # directory when there actually is one.
        directory = os.path.dirname(filename)
        if directory:
            os.makedirs(directory, exist_ok=True)

        # For documents the LocalID/VolumeID/Secret columns are reused to
        # carry the document ID, version and access hash respectively.
        if media_type == 'document':
            location = types.InputDocumentFileLocation(
                id=media_row[0],
                version=media_row[1],
                access_hash=media_row[2])
        else:
            location = types.InputFileLocation(local_id=media_row[0],
                                               volume_id=media_row[1],
                                               secret=media_row[2])

        def progress(saved, total):
            """Increment the tqdm progress bar"""
            if total is None:
                # No size was found so the bar total wasn't incremented before
                bar.total += saved
                bar.update(saved)
            elif saved == total:
                # Downloaded the last bit (which is probably <> part size)
                mod = (saved % DOWNLOAD_PART_SIZE) or DOWNLOAD_PART_SIZE
                bar.update(mod)
            else:
                # All chunks are of the same size and this isn't the last one
                bar.update(DOWNLOAD_PART_SIZE)

        if media_row[6] is not None:
            bar.total += media_row[6]

        # Deliberately not reset in a ``finally``: the path stays recorded
        # if the download raises.
        self._incomplete_download = filename
        await self.client.download_file(location,
                                        file=filename,
                                        file_size=media_row[6],
                                        part_size_kb=DOWNLOAD_PART_SIZE //
                                        1024,
                                        progress_callback=progress)
        self._incomplete_download = None
Esempio n. 4
0
    async def _download_media(self, media_id, context_id, sender_id, date,
                              progress):
        """
        Download the media document stored under ``media_id``.

        The media is looked up through ``db_media.find_one()``. It is
        skipped (nothing is downloaded) when:

        * a non-zero size is stored and it is above ``self.max_size`` or
          below ``self.min_size``;
        * its type is neither ``photo`` nor ``document``;
        * it has no stored name;
        * the target file already exists.

        The target path is ``self.media_fmt`` with ``date`` applied through
        ``strftime`` and then formatted with the context/sender fields,
        followed by ``.<media_id><ext>``. The extension comes from the
        stored name when it has one, otherwise from the MIME type.

        ``progress`` is the progress tracker to advance: its ``name`` is set
        to the media name, its ``total`` grows by the stored size (when
        known) and ``inc()`` is called as data arrives.

        While downloading, the target path is kept in
        ``self._incomplete_download``; it is reset to ``None`` only when
        the download finishes without raising.
        """
        media_row = await db_media.find_one({'_id': media_id})
        progress.name = media_row['name']
        if media_row['size']:
            if media_row['size'] > self.max_size:
                logger.warning('忽略过大文件:%s', media_row['name'])
                return  # Skip files that are too large
            if media_row['size'] < self.min_size:
                logger.warning('忽略过小文件:%s', media_row['name'])
                return  # Skip files that are too small
        # Documents have attributes and they're saved under the "document"
        # namespace so we need to split it before actually comparing.
        media_type = media_row['type'].split('.')
        media_type, media_subtype = media_type[0], media_type[-1]
        if media_type not in ('document', 'photo'):
            logger.info('忽略文档类型:%s', media_type)
            return  # Only photos or documents are actually downloadable

        formatter = defaultdict(str,
                                context_id=context_id,
                                sender_id=sender_id,
                                type=media_subtype or 'unknown',
                                name=await self._get_name(context_id)
                                or 'unknown',
                                sender_name=await self._get_name(sender_id)
                                or 'unknown')

        filename = media_row['name']
        if not filename:
            # Media without a stored name is deliberately not downloaded
            logger.debug('忽略无名称文件')
            return

        # Use the extension from the filename if any (more accurate than
        # mime), otherwise detect a sensible one from the known mimetype.
        filename, ext = os.path.splitext(filename)
        if not ext:
            ext = export_utils.get_extension(media_row['mime_type'])

        # Apply the date to the user format string and then replace the map
        formatter['filename'] = fix_windows_filename(filename)
        filename = date.strftime(self.media_fmt).format_map(formatter)
        filename += '.{}{}'.format(media_id, ext)
        if os.path.isfile(filename):
            logger.debug('Skipping already-existing file %s', filename)
            return
        logger.info('正在下载:%s 至 %s', media_type, filename)

        # A format without a directory component gives '' here, and
        # os.makedirs('') raises FileNotFoundError, so only create the
        # directory when there actually is one.
        directory = os.path.dirname(filename)
        if directory:
            os.makedirs(directory, exist_ok=True)

        # For documents the local_id/volume_id/secret fields are reused to
        # carry the document ID, version and access hash respectively.
        if media_type == 'document':
            location = types.InputDocumentFileLocation(
                id=media_row['local_id'],
                version=media_row['volume_id'],
                access_hash=media_row['secret'])
        else:
            location = types.InputFileLocation(
                local_id=media_row['local_id'],
                volume_id=media_row['volume_id'],
                secret=media_row['secret'])

        def progress_callback(saved, total):
            """Advance the progress tracker as data is downloaded"""
            if total is None:
                # No size was found so the total wasn't incremented before
                progress.total += saved
                progress.inc(saved)
            elif saved == total:
                # Downloaded the last bit (which is probably <> part size)
                mod = (saved % DOWNLOAD_PART_SIZE) or DOWNLOAD_PART_SIZE
                progress.inc(mod)
            else:
                # All chunks are of the same size and this isn't the last one
                progress.inc(DOWNLOAD_PART_SIZE)

        if media_row['size'] is not None:
            progress.total += media_row['size']

        # Deliberately not reset in a ``finally``: the path stays recorded
        # if the download raises.
        self._incomplete_download = filename
        await self.client.download_file(location,
                                        file=filename,
                                        file_size=media_row['size'],
                                        part_size_kb=DOWNLOAD_PART_SIZE //
                                        1024,
                                        progress_callback=progress_callback)
        self._incomplete_download = None
Esempio n. 5
0
    def download_past_media(self, dumper, target_id):
        """
        Downloads the past media that has already been dumped into the
        database but has not been downloaded for the given target ID yet.

        Media which formatted filename results in an already-existing file
        will be *ignored* and not re-downloaded again.

        ``dumper`` must expose the database connection as ``dumper.conn``;
        the ``Message``, ``Media`` and ``User`` tables are read from it.
        ``target_id`` is resolved through ``self.client.get_input_entity()``
        and only messages whose context is that entity are considered.

        Only ``photo`` and ``document`` media is downloaded, and the method
        sleeps for one second after every download.
        """
        # TODO Should this respect and download only allowed media? Or all?
        target_in = self.client.get_input_entity(target_id)
        target = self.client.get_entity(target_in)
        target_id = utils.get_peer_id(target)

        msg_cursor = dumper.conn.cursor()
        msg_cursor.execute(
            'SELECT ID, Date, FromID, MediaID FROM Message '
            'WHERE ContextID = ? AND MediaID IS NOT NULL', (target_id, ))

        while True:
            msg_row = msg_cursor.fetchone()
            if not msg_row:
                break  # No more messages with media

            media_row = dumper.conn.execute(
                'SELECT LocalID, VolumeID, Secret, Type, MimeType, Name '
                'FROM Media WHERE ID = ?', (msg_row[3], )).fetchone()
            # Documents have attributes and they're saved under the "document"
            # namespace so we need to split it before actually comparing.
            media_type = media_row[3].split('.')
            media_type, media_subtype = media_type[0], media_type[-1]
            if media_type not in ('photo', 'document'):
                # Only photos or documents are actually downloadable
                continue

            user_row = dumper.conn.execute(
                'SELECT FirstName, LastName FROM User WHERE ID = ?',
                (msg_row[2], )).fetchone()
            if user_row:
                # The name must come from the User row (FirstName, LastName),
                # not from the message row (whose first columns are ID, Date).
                sender_name = '{} {}'.format(user_row[0] or '', user_row[1]
                                             or '').strip()
            else:
                sender_name = ''

            # guess_extension() cannot handle None, so only ask it when a
            # MIME type was stored; fall back to a generic extension.
            mime_type = media_row[4]
            ext = (mime_type and mimetypes.guess_extension(mime_type)) \
                or '.bin'
            if ext == '.jpe':
                ext = '.jpg'  # Nobody uses .jpe for photos

            date = datetime.datetime.utcfromtimestamp(msg_row[1])
            formatter = defaultdict(str,
                                    id=msg_row[0],
                                    context_id=target_id,
                                    sender_id=msg_row[2] or 0,
                                    type=media_subtype or 'unknown',
                                    ext=ext,
                                    name=utils.get_display_name(target)
                                    or 'unknown',
                                    sender_name=sender_name or 'unknown')

            # Photos never use the stored name, only a date-based one
            name = None if media_subtype == 'photo' else media_row[5]
            formatter['filename'] = name or date.strftime(
                '{}_%Y-%m-%d_%H-%M-%S'.format(formatter['type']))
            filename = date.strftime(self.media_fmt).format_map(formatter)
            if not filename.endswith(formatter['ext']):
                if filename.endswith('.'):
                    filename = filename[:-1]
                filename += formatter['ext']

            if os.path.isfile(filename):
                __log__.debug('Skipping existing file %s', filename)
                continue

            __log__.info('Downloading to %s', filename)
            # A format without a directory component gives '' here, and
            # os.makedirs('') raises FileNotFoundError, so only create the
            # directory when there actually is one.
            directory = os.path.dirname(filename)
            if directory:
                os.makedirs(directory, exist_ok=True)

            # For documents the LocalID/VolumeID/Secret columns are reused
            # to carry the document ID, version and access hash.
            if media_type == 'document':
                location = types.InputDocumentFileLocation(
                    id=media_row[0],
                    version=media_row[1],
                    access_hash=media_row[2])
            else:
                location = types.InputFileLocation(
                    local_id=media_row[0],
                    volume_id=media_row[1],
                    secret=media_row[2])
            self.client.download_file(location, file=filename)
            time.sleep(1)