Example 1
    def __init__(self, api: RestClient, root_folder: Path, db: LocalData,
                 settings: Settings):
        """
        Parameters:
            api: object representing the Google REST API
            root_folder: path to the root of local file synchronization
            db: local database for indexing
            settings: further arguments
        """
        self._db: LocalData = db
        self._root_folder: Path = root_folder
        self._api: RestClient = api

        self.files_downloaded: int = 0
        self.files_download_started: int = 0
        self.files_download_skipped: int = 0
        self.files_download_failed: int = 0

        self.settings = settings
        self.max_threads = settings.max_threads
        self.start_date: datetime = settings.start_date
        self.end_date: datetime = settings.end_date
        self.retry_download: bool = settings.retry_download
        self.case_insensitive_fs: bool = settings.case_insensitive_fs
        self.video_timeout: int = 2000
        self.image_timeout: int = 60

        # attributes related to multi-threaded download
        self.download_pool = futures.ThreadPoolExecutor(
            max_workers=self.max_threads)
        self.pool_future_to_media = {}
        self.bad_ids = BadIds(self._root_folder)

        self.current_umask = os.umask(7)
        os.umask(self.current_umask)

        self._session = requests.Session()
        # define the retry behaviour for each connection. Note that
        # respect_retry_after_header=True means that status codes [413, 429, 503]
        # will back off for the period recommended in the Retry-After header
        retries = Retry(
            total=settings.max_retries,
            backoff_factor=5,
            status_forcelist=[500, 502, 503, 504, 509],
            method_whitelist=frozenset(["GET", "POST"]),
            raise_on_status=False,
            respect_retry_after_header=True,
        )
        self._session.mount(
            "https://",
            HTTPAdapter(max_retries=retries, pool_maxsize=self.max_threads))
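
A minimal standalone sketch of the retry/backoff idiom used above, assuming urllib3 older than 2.0 (newer releases rename method_whitelist to allowed_methods); the URL and counts are placeholders only:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# retry idempotent requests on transient server errors, with exponential backoff
retries = Retry(
    total=3,
    backoff_factor=0.5,                      # exponential backoff between attempts
    status_forcelist=[500, 502, 503, 504],
    method_whitelist=frozenset(["GET"]),     # 'allowed_methods' in newer urllib3
    raise_on_status=False,
)

session = requests.Session()
# every https:// request made through this session now gets the retry policy
session.mount("https://", HTTPAdapter(max_retries=retries, pool_maxsize=4))
response = session.get("https://example.com/", timeout=10)
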
Example 2
    def __init__(self,
                 api: RestClient,
                 root_folder: Path,
                 db: LocalData,
                 max_retries: int = 5,
                 max_threads: int = 20):
        """
        Parameters:
            api: object representing the Google REST API
            root_folder: path to the root of local file synchronization
            db: local database for indexing
        """
        self._db: LocalData = db
        self._root_folder: Path = root_folder
        self._api: RestClient = api
        self.max_threads = max_threads

        self.files_downloaded: int = 0
        self.files_download_started: int = 0
        self.files_download_skipped: int = 0
        self.files_download_failed: int = 0

        # attributes to be set after init
        # thus in theory one instance could do multiple indexes
        self.start_date: datetime = None
        self.end_date: datetime = None
        self.retry_download: bool = False
        self.video_timeout: int = 2000
        self.image_timeout: int = 60
        self.case_insensitive_fs: bool = False

        # attributes related to multi-threaded download
        self.download_pool = futures.ThreadPoolExecutor(
            max_workers=self.max_threads)
        self.pool_future_to_media = {}
        self.bad_ids = BadIds(self._root_folder)

        self.current_umask = os.umask(7)
        os.umask(self.current_umask)

        self._session = requests.Session()
        retries = Retry(total=max_retries,
                        backoff_factor=0.1,
                        status_forcelist=[500, 502, 503, 504],
                        method_whitelist=frozenset(['GET', 'POST']),
                        raise_on_status=False)
        self._session.mount(
            'https://',
            HTTPAdapter(max_retries=retries, pool_maxsize=self.max_threads))
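
Both constructor variants read the current process umask with the os.umask(7) / os.umask(previous) dance and later apply it as 0o666 & ~umask. A small sketch of that idiom, using a hypothetical file name:

import os

# os.umask sets a new mask and returns the previous one, so setting a throwaway
# value and immediately restoring it is the portable way to read the current umask
current_umask = os.umask(7)
os.umask(current_umask)

# a downloaded file then gets rw-rw-rw- minus whatever bits the umask removes
target = "downloaded.jpg"                 # hypothetical path for this sketch only
open(target, "wb").close()
os.chmod(target, 0o666 & ~current_umask)
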
Example 3
    def test_bad_ids(self, do_download_file):

        do_download_file.side_effect = HTTPError(Mock(status=500), 'ouch!')
        s = ts.SetupDbAndCredentials()
        args = [
            '--start-date', TestAccount.start, '--end-date', TestAccount.end,
            '--skip-albums'
        ]
        s.test_setup('test_bad_ids',
                     args=args,
                     trash_db=True,
                     trash_files=True)
        s.gp.start(s.parsed_args)
        # check we tried to download each of the 2016 images
        self.assertEqual(
            do_download_file.call_count, TestAccount.image_count_2016,
            "Expected {} downloads".format(TestAccount.image_count_2016))

        # this should have created a Bad IDs file
        bad_ids = BadIds(s.root)
        self.assertEqual(
            len(bad_ids.items), TestAccount.image_count_2016,
            "Expected {} Bad IDs entries".format(TestAccount.image_count_2016))

        do_download_file.reset_mock()

        s.test_setup('test_bad_ids', args=args)
        s.gp.start(s.parsed_args)
        # this should have skipped the bad ids and not tried to download
        self.assertEqual(do_download_file.call_count, 0,
                         "Expected 0 calls to do_download")
Example 4
    def test_bad_ids(self, do_download_file):

        do_download_file.side_effect = HTTPError(Mock(status=500), 'ouch!')
        s = ts.SetupDbAndCredentials()
        args = [
            '--start-date', '2016-01-01', '--end-date', '2017-01-01',
            '--skip-albums'
        ]
        s.test_setup('test_bad_ids',
                     args=args,
                     trash_db=True,
                     trash_files=True)
        s.gp.start(s.parsed_args)
        # check we tried to download 10 times
        self.assertEqual(do_download_file.call_count, 10)

        # this should have created a Bad IDs file
        bad_ids = BadIds(s.root)
        self.assertEqual(len(bad_ids.items), 10)

        s.test_setup('test_bad_ids', args=args)
        s.gp.start(s.parsed_args)
        # the bad ids should have been skipped, so the call count is unchanged
        self.assertEqual(do_download_file.call_count, 10)
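
The only behavioural difference from the previous example is that the mock is never reset between the two runs, so call_count carries over from the first pass: an unchanged count of 10 shows the second run made no new download attempts. A short sketch of how call_count accumulates unless reset_mock() is used:

from unittest.mock import Mock

download = Mock()
for _ in range(10):
    download()
assert download.call_count == 10     # count after the first pass

# a second pass that makes no calls leaves the counter untouched
assert download.call_count == 10

download.reset_mock()                # the previous example resets instead
assert download.call_count == 0
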
Example 5
class GooglePhotosDownload(object):
    """A Class for managing the indexing and download of Google Photos
    """
    PAGE_SIZE: int = 100
    MAX_THREADS: int = 20
    BATCH_SIZE: int = 40

    def __init__(self, api: RestClient, root_folder: Path, db: LocalData):
        """
        Parameters:
            api: object representing the Google REST API
            root_folder: path to the root of local file synchronization
            db: local database for indexing
        """
        self._db: LocalData = db
        self._root_folder: Path = root_folder
        self._api: RestClient = api

        self.files_downloaded: int = 0
        self.files_download_started: int = 0
        self.files_download_skipped: int = 0
        self.files_download_failed: int = 0

        # attributes to be set after init
        # thus in theory one instance could do multiple indexes
        self.start_date: datetime = None
        self.end_date: datetime = None
        self.retry_download: bool = False
        self.video_timeout: int = 2000
        self.image_timeout: int = 60

        # attributes related to multi-threaded download
        self.download_pool = futures.ThreadPoolExecutor(
            max_workers=self.MAX_THREADS)
        self.pool_future_to_media = {}
        self.bad_ids = BadIds(self._root_folder)

        self.current_umask = os.umask(7)
        os.umask(self.current_umask)

        self._session = requests.Session()
        retries = Retry(total=5,
                        backoff_factor=0.1,
                        status_forcelist=[500, 502, 503, 504],
                        method_whitelist=frozenset(['GET', 'POST']),
                        raise_on_status=False)
        self._session.mount(
            'https://',
            HTTPAdapter(max_retries=retries, pool_maxsize=self.MAX_THREADS))

    def download_photo_media(self):
        """
        Here we batch up our requests to get the base url for downloading media.
        This avoids the overhead of one REST call per file; a REST call
        takes longer than downloading an image.
        """

        def grouper(
                iterable: Iterable[DatabaseMedia]) \
                -> Iterable[Iterable[DatabaseMedia]]:
            """Collect data into chunks size BATCH_SIZE"""
            return zip_longest(*[iter(iterable)] * self.BATCH_SIZE,
                               fillvalue=None)

        if not self.retry_download:
            self.files_download_skipped = self._db.downloaded_count()

        log.warning('Downloading Photos ...')
        try:
            for media_items_block in grouper(
                    self._db.get_rows_by_search(
                        GooglePhotosRow,
                        start_date=self.start_date,
                        end_date=self.end_date,
                        skip_downloaded=not self.retry_download)):
                batch = {}

                items = (mi for mi in media_items_block if mi)
                for media_item in items:
                    local_folder = \
                        self._root_folder / media_item.relative_folder
                    local_full_path = \
                        local_folder / media_item.filename

                    if local_full_path.exists():
                        self.files_download_skipped += 1
                        log.debug('SKIPPED download (file exists) %d %s',
                                  self.files_download_skipped,
                                  media_item.relative_path)
                        self._db.put_downloaded(media_item.id)

                    elif self.bad_ids.check_id_ok(media_item.id):
                        batch[media_item.id] = media_item
                        if not local_folder.is_dir():
                            local_folder.mkdir(parents=True)

                if len(batch) > 0:
                    self.download_batch(batch)
        finally:
            # allow any remaining background downloads to complete
            futures_left = list(self.pool_future_to_media.keys())
            self.do_download_complete(futures_left)
            log.warning(
                'Downloaded %d Items, Failed %d, Already Downloaded %d',
                self.files_downloaded, self.files_download_failed,
                self.files_download_skipped)
            self.bad_ids.store_ids()
            self.bad_ids.report()

    def download_batch(self, batch: Mapping[str, DatabaseMedia]):
        """ Downloads a batch of media items collected in download_photo_media.

        A fresh 'base_url' is required since they have limited lifespan and
        these are obtained by a single call to the service function
        mediaItems.batchGet.
        """
        try:
            response = self._api.mediaItems.batchGet.execute(
                mediaItemIds=batch.keys())
            r_json = response.json()
            if r_json.get('pageToken'):
                log.error("Ops - Batch size too big, some items dropped!")

            for i, result in enumerate(r_json["mediaItemResults"]):
                media_item_json = result.get("mediaItem")
                if not media_item_json:
                    log.warning('Null response in mediaItems.batchGet %s',
                                batch.keys())
                    log.debug(
                        "Null response in mediaItems.batchGet"
                        "for item &d in\n\n %s \n\n which is \n%s", i, r_json,
                        result)
                else:
                    media_item = batch.get(media_item_json["id"])
                    self.download_file(media_item, media_item_json)

        except KeyboardInterrupt:
            log.warning('Cancelling download threads ...')
            for f in self.pool_future_to_media:
                f.cancel()
            futures.wait(self.pool_future_to_media)
            log.warning('Cancelled download threads')
            raise
        except RequestException:
            self.find_bad_items(batch)

    def download_file(self, media_item: DatabaseMedia, media_json: dict):
        """ farms a single media download off to the thread pool.

        Uses a dictionary of Futures -> mediaItem to track downloads that are
        currently scheduled/running. When a Future is done it calls
        do_download_complete to remove the Future from the dictionary and
        complete processing of the media item.
        """
        base_url = media_json['baseUrl']

        # we don't want a massive queue so wait until at least one thread is free
        while len(self.pool_future_to_media) >= self.MAX_THREADS:
            # check which futures are done, complete the main thread work
            # and remove them from the dictionary
            done_list = []
            for future in self.pool_future_to_media.keys():
                if future.done():
                    done_list.append(future)

            self.do_download_complete(done_list)

        # start a new background download
        self.files_download_started += 1
        log.info('downloading %d %s', self.files_download_started,
                 media_item.relative_path)
        future = self.download_pool.submit(self.do_download_file, base_url,
                                           media_item)
        self.pool_future_to_media[future] = media_item

    def do_download_file(self, base_url: str, media_item: DatabaseMedia):
        """ Runs in a process pool and does a download of a single media item.
        """
        local_folder = self._root_folder / media_item.relative_folder
        local_full_path = local_folder / media_item.filename
        if media_item.is_video():
            download_url = '{}=dv'.format(base_url)
            timeout = self.video_timeout
        else:
            download_url = '{}=d'.format(base_url)
            timeout = self.image_timeout
        temp_file = tempfile.NamedTemporaryFile(dir=local_folder, delete=False)
        t_path = Path(temp_file.name)

        try:
            response = self._session.get(download_url,
                                         stream=True,
                                         timeout=timeout)
            response.raise_for_status()
            shutil.copyfileobj(response.raw, temp_file)
            temp_file.close()
            temp_file = None
            response.close()
            t_path.rename(local_full_path)
            os.utime(str(local_full_path),
                     (Utils.safe_timestamp(media_item.modify_date),
                      Utils.safe_timestamp(media_item.create_date)))
            os.chmod(str(local_full_path), 0o666 & ~self.current_umask)
        except KeyboardInterrupt:
            log.debug("User cancelled download thread")
            raise
        finally:
            if temp_file:
                temp_file.close()
            if t_path.exists():
                t_path.unlink()

    def do_download_complete(self, futures_list: Union[Mapping[futures.Future,
                                                               DatabaseMedia],
                                                       List[futures.Future]]):
        """ runs in the main thread and completes processing of a media
        item once (multi threaded) do_download has completed
        """
        for future in futures_list:
            media_item = self.pool_future_to_media.get(future)
            timeout = self.video_timeout if media_item.is_video() else \
                self.image_timeout
            e = future.exception(timeout=timeout)
            if e:
                self.files_download_failed += 1
                log.error('FAILURE %d downloading %s',
                          self.files_download_failed, media_item.relative_path)
                if isinstance(e, RequestException):
                    self.bad_ids.add_id(media_item.relative_path,
                                        media_item.id, media_item.url, e)
                else:
                    raise e
            else:
                self._db.put_downloaded(media_item.id)
                self.files_downloaded += 1
                log.debug('COMPLETED %d downloading %s', self.files_downloaded,
                          media_item.relative_path)
            del self.pool_future_to_media[future]

    def find_bad_items(self, batch: Mapping[str, DatabaseMedia]):
        """
        A batch get failed. Now do all of its contents as individual
        gets so we can work out which ID(s) caused the failure
        """
        for item_id, media_item in batch.items():
            try:
                log.debug('BAD ID Retry on %s (%s)', item_id,
                          media_item.relative_path)
                response = self._api.mediaItems.get.execute(
                    mediaItemId=item_id)
                media_item_json = response.json()
                self.download_file(media_item, media_item_json)
            except RequestException as e:
                self.bad_ids.add_id(str(media_item.relative_path),
                                    media_item.id, media_item.url, e)
                self.files_download_failed += 1
                log.error('FAILURE %d in get of %s BAD ID',
                          self.files_download_failed, media_item.relative_path)
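
download_photo_media relies on the zip_longest grouper idiom to slice database rows into BATCH_SIZE chunks so that a single mediaItems.batchGet call covers many files. A standalone sketch of that chunking trick, with a made-up batch size:

from itertools import zip_longest

def grouper(iterable, batch_size):
    """Yield tuples of batch_size items; the final tuple is padded with None."""
    # repeating the same iterator batch_size times makes zip pull consecutive items
    return zip_longest(*[iter(iterable)] * batch_size, fillvalue=None)

items = range(1, 8)                      # hypothetical media rows
for block in grouper(items, 3):
    batch = [i for i in block if i]      # drop the None padding, as the class does
    print(batch)                         # [1, 2, 3] then [4, 5, 6] then [7]
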
Example 6
class GooglePhotosDownload(object):
    """A Class for managing the indexing and download of Google Photos
    """

    PAGE_SIZE: int = 100
    BATCH_SIZE: int = 40

    def __init__(self, api: RestClient, root_folder: Path, db: LocalData,
                 settings: Settings):
        """
        Parameters:
            api: object representing the Google REST API
            root_folder: path to the root of local file synchronization
            db: local database for indexing
            settings: further arguments
        """
        self._db: LocalData = db
        self._root_folder: Path = root_folder
        self._api: RestClient = api

        self.files_downloaded: int = 0
        self.files_download_started: int = 0
        self.files_download_skipped: int = 0
        self.files_download_failed: int = 0

        self.settings = settings
        self.max_threads = settings.max_threads
        self.start_date: datetime = settings.start_date
        self.end_date: datetime = settings.end_date
        self.retry_download: bool = settings.retry_download
        self.case_insensitive_fs: bool = settings.case_insensitive_fs
        self.video_timeout: int = 2000
        self.image_timeout: int = 60

        # attributes related to multi-threaded download
        self.download_pool = futures.ThreadPoolExecutor(
            max_workers=self.max_threads)
        self.pool_future_to_media = {}
        self.bad_ids = BadIds(self._root_folder)

        self.current_umask = os.umask(7)
        os.umask(self.current_umask)

        self._session = requests.Session()
        # define the retry behaviour for each connection. Note that
        # respect_retry_after_header=True means that status codes [413, 429, 503]
        # will back off for the period recommended in the Retry-After header
        retries = Retry(
            total=settings.max_retries,
            backoff_factor=5,
            status_forcelist=[500, 502, 503, 504, 509],
            method_whitelist=frozenset(["GET", "POST"]),
            raise_on_status=False,
            respect_retry_after_header=True,
        )
        self._session.mount(
            "https://",
            HTTPAdapter(max_retries=retries, pool_maxsize=self.max_threads))

    def download_photo_media(self):
        """
        Here we batch up our requests to get the base url for downloading media.
        This avoids the overhead of one REST call per file; a REST call
        takes longer than downloading an image.
        """
        def grouper(
            iterable: Iterable[DatabaseMedia],
        ) -> Iterable[Iterable[DatabaseMedia]]:
            """Collect data into chunks size BATCH_SIZE"""
            return zip_longest(*[iter(iterable)] * self.BATCH_SIZE,
                               fillvalue=None)

        if not self.retry_download:
            self.files_download_skipped = self._db.downloaded_count()

        log.warning("Downloading Photos ...")
        try:
            for media_items_block in grouper(
                    self._db.get_rows_by_search(
                        GooglePhotosRow,
                        start_date=self.start_date,
                        end_date=self.end_date,
                        skip_downloaded=not self.retry_download,
                    )):
                batch = {}

                items = (mi for mi in media_items_block if mi)
                for media_item in items:
                    if self.case_insensitive_fs:
                        relative_folder = str(
                            media_item.relative_folder).lower()
                        filename = str(media_item.filename).lower()
                    else:
                        relative_folder = media_item.relative_folder
                        filename = media_item.filename
                    local_folder = self._root_folder / relative_folder
                    local_full_path = local_folder / filename

                    try:
                        if local_full_path.exists():
                            self.files_download_skipped += 1
                            log.debug(
                                "SKIPPED download (file exists) %d %s",
                                self.files_download_skipped,
                                media_item.relative_path,
                            )
                            self._db.put_downloaded(media_item.id)

                        elif self.bad_ids.check_id_ok(media_item.id):
                            batch[media_item.id] = media_item
                            if not local_folder.is_dir():
                                local_folder.mkdir(parents=True)

                    except Exception as err:
                        # skip files with filenames too long for this OS.
                        # probably thrown by local_full_path.exists().
                        errname = type(err).__name__
                        if errname == "OSError" and err.errno == errno.ENAMETOOLONG:
                            log.warning(
                                "SKIPPED file because name is too long for this OS %s",
                                local_full_path,
                            )
                            self.files_download_failed += 1
                        else:
                            # re-raise other errors
                            raise

                if len(batch) > 0:
                    self.download_batch(batch)
        finally:
            # allow any remaining background downloads to complete
            futures_left = list(self.pool_future_to_media.keys())
            self.do_download_complete(futures_left)
            log.warning(
                "Downloaded %d Items, Failed %d, Already Downloaded %d",
                self.files_downloaded,
                self.files_download_failed,
                self.files_download_skipped,
            )
            self.bad_ids.store_ids()
            self.bad_ids.report()
        return self.files_downloaded

    def download_batch(self, batch: Mapping[str, DatabaseMedia]):
        """ Downloads a batch of media items collected in download_photo_media.

        A fresh 'base_url' is required since they have limited lifespan and
        these are obtained by a single call to the service function
        mediaItems.batchGet.
        """
        try:
            response = self._api.mediaItems.batchGet.execute(
                mediaItemIds=batch.keys())
            r_json = response.json()
            if r_json.get("pageToken"):
                log.error("Ops - Batch size too big, some items dropped!")

            for i, result in enumerate(r_json["mediaItemResults"]):
                media_item_json = result.get("mediaItem")
                if not media_item_json:
                    log.warning("Null response in mediaItems.batchGet %s",
                                batch.keys())
                    log.debug(
                        "Null response in mediaItems.batchGet"
                        "for item %d in\n\n %s \n\n which is \n%s",
                        i,
                        str(r_json),
                        str(result),
                    )
                else:
                    media_item = batch.get(media_item_json["id"])
                    self.download_file(media_item, media_item_json)
        except RequestException:
            self.find_bad_items(batch)

        except KeyboardInterrupt:
            log.warning("Cancelling download threads ...")
            for f in self.pool_future_to_media:
                f.cancel()
            futures.wait(self.pool_future_to_media)
            log.warning("Cancelled download threads")
            raise

    def download_file(self, media_item: DatabaseMedia, media_json: dict):
        """ farms a single media download off to the thread pool.

        Uses a dictionary of Futures -> mediaItem to track downloads that are
        currently scheduled/running. When a Future is done it calls
        do_download_complete to remove the Future from the dictionary and
        complete processing of the media item.
        """
        base_url = media_json["baseUrl"]

        # we don't want a massive queue so wait until at least one thread is free
        while len(self.pool_future_to_media) >= self.max_threads:
            # check which futures are done, complete the main thread work
            # and remove them from the dictionary
            done_list = []
            for future in self.pool_future_to_media.keys():
                if future.done():
                    done_list.append(future)

            self.do_download_complete(done_list)

        # start a new background download
        self.files_download_started += 1
        log.info("downloading %d %s", self.files_download_started,
                 media_item.relative_path)
        future = self.download_pool.submit(self.do_download_file, base_url,
                                           media_item)
        self.pool_future_to_media[future] = media_item

    def do_download_file(self, base_url: str, media_item: DatabaseMedia):
        """ Runs in a process pool and does a download of a single media item.
        """
        if self.case_insensitive_fs:
            relative_folder = str(media_item.relative_folder).lower()
            filename = str(media_item.filename).lower()
        else:
            relative_folder = media_item.relative_folder
            filename = media_item.filename
        local_folder = self._root_folder / relative_folder
        local_full_path = local_folder / filename

        if media_item.is_video():
            download_url = "{}=dv".format(base_url)
            timeout = self.video_timeout
        else:
            download_url = "{}=d".format(base_url)
            timeout = self.image_timeout
        temp_file = tempfile.NamedTemporaryFile(dir=local_folder, delete=False)
        t_path = Path(temp_file.name)

        try:
            response = self._session.get(download_url,
                                         stream=True,
                                         timeout=timeout)
            response.raise_for_status()
            shutil.copyfileobj(response.raw, temp_file)
            temp_file.close()
            temp_file = None
            response.close()
            t_path.rename(local_full_path)
            create_date = Utils.safe_timestamp(media_item.create_date)
            os.utime(
                str(local_full_path),
                (
                    Utils.safe_timestamp(media_item.modify_date).timestamp(),
                    create_date.timestamp(),
                ),
            )
            if _use_win_32:
                file_handle = win32file.CreateFile(
                    str(local_full_path),
                    win32file.GENERIC_WRITE,
                    0,
                    None,
                    win32con.OPEN_EXISTING,
                    0,
                    None,
                )
                win32file.SetFileTime(file_handle, *(create_date, ) * 3)
                file_handle.close()
            os.chmod(str(local_full_path), 0o666 & ~self.current_umask)
        except KeyboardInterrupt:
            log.debug("User cancelled download thread")
            raise
        finally:
            if temp_file:
                temp_file.close()
            if t_path.exists():
                t_path.unlink()

    def do_download_complete(
        self,
        futures_list: Union[Mapping[futures.Future, DatabaseMedia],
                            List[futures.Future]],
    ):
        """ runs in the main thread and completes processing of a media
        item once (multi threaded) do_download has completed
        """
        for future in futures_list:
            media_item = self.pool_future_to_media.get(future)
            timeout = (self.video_timeout
                       if media_item.is_video() else self.image_timeout)
            e = future.exception(timeout=timeout)
            if e:
                self.files_download_failed += 1
                log.error(
                    "FAILURE %d downloading %s - %s",
                    self.files_download_failed,
                    media_item.relative_path,
                    e,
                )
                # treat API errors as possibly transient. Report them above in
                # log.error but do not raise them. Other exceptions will raise
                # up to the root handler and abort. Note that all retry logic is
                # already handled in urllib3

                # Items that cause API errors go in a BadIds file which must
                # be deleted to retry these items.
                if isinstance(e, RequestException):
                    self.bad_ids.add_id(media_item.relative_path,
                                        media_item.id, media_item.url, e)
                else:
                    raise e
            else:
                self._db.put_downloaded(media_item.id)
                self.files_downloaded += 1
                log.debug(
                    "COMPLETED %d downloading %s",
                    self.files_downloaded,
                    media_item.relative_path,
                )
                if self.settings.progress and self.files_downloaded % 10 == 0:
                    log.warning(
                        f"Downloaded {self.files_downloaded} items ...\033[F")
            del self.pool_future_to_media[future]

    def find_bad_items(self, batch: Mapping[str, DatabaseMedia]):
        """
        A batch get failed. Now do all of its contents as individual
        gets so we can work out which ID(s) caused the failure
        """
        for item_id, media_item in batch.items():
            try:
                log.debug("BAD ID Retry on %s (%s)", item_id,
                          media_item.relative_path)
                response = self._api.mediaItems.get.execute(
                    mediaItemId=item_id)
                media_item_json = response.json()
                self.download_file(media_item, media_item_json)
            except RequestException as e:
                self.bad_ids.add_id(str(media_item.relative_path),
                                    media_item.id, media_item.url, e)
                self.files_download_failed += 1
                log.error(
                    "FAILURE %d in get of %s BAD ID",
                    self.files_download_failed,
                    media_item.relative_path,
                )