Example no. 1
0
    def load_files(self,
                   fallback_city: str,
                   max_workers: Optional[int] = None) -> Tuple[int, int]:
        """Downloads and analyses the actual file for the file entries in the database.

        Only files that have not been analysed yet (``filesize`` is null) and
        that have a known download URL are processed.

        :param fallback_city: city name passed to the address extraction when
            a file does not name a city itself
        :param max_workers: process pool size; ``None`` lets the executor pick
        :return: the number of successful and failed files
        """
        # This is partially bound by waiting on external resources, but mostly very cpu intensive,
        # so we can spawn a bunch of processes to make this a lot faster.
        # We need to build a list because mysql connections and process pools don't pair well.
        files = list(
            File.objects.filter(
                filesize__isnull=True,
                oparl_access_url__isnull=False).order_by("-id").values_list(
                    "id", flat=True))
        logger.info("Downloading and analysing {} files".format(len(files)))
        address_pipeline = AddressPipeline(create_geoextract_data())
        # Only show a progress bar on an interactive terminal outside of tests
        pbar = None
        if sys.stdout.isatty() and not settings.TESTING:
            pbar = tqdm(total=len(files))
        failed = 0
        successful = 0

        if not self.force_singlethread:
            # We need to close the database connections, which will be automatically reopened for
            # each process
            # See https://stackoverflow.com/a/10684672/3549270
            # and https://brobin.me/blog/2017/05/mutiprocessing-in-python-django-management-commands/
            db.connections.close_all()

            with ProcessPoolExecutor(max_workers=max_workers) as executor:
                for succeeded in executor.map(
                        self.download_and_analyze_file,
                        files,
                        repeat(address_pipeline),
                        repeat(fallback_city),
                ):
                    # Bug fix: the original only counted failures here, so
                    # `successful` was always 0 in the multiprocess path.
                    if succeeded:
                        successful += 1
                    else:
                        failed += 1
                    if pbar:
                        pbar.update()

        else:
            for file in files:
                succeeded = self.download_and_analyze_file(
                    file, address_pipeline, fallback_city)

                if not succeeded:
                    failed += 1
                else:
                    successful += 1

                if pbar:
                    pbar.update()
        if pbar:
            pbar.close()

        if failed > 0:
            logger.error("{} files failed to download".format(failed))

        return successful, failed
Example no. 2
0
    def handle(self, *args, **options):
        """Entry point of the management command.

        With explicit ``ids`` given, downloads and analyses just those files;
        otherwise delegates to the importer's bulk loader for all pending files.
        """
        importer, body = self.get_importer(options)
        file_ids = options["ids"]

        # No explicit ids: process everything that is still pending.
        if not file_ids:
            importer.load_files(max_workers=options["max_workers"],
                                fallback_city=body.short_name)
            return

        address_pipeline = AddressPipeline(create_geoextract_data())
        # Count how many of the requested files could not be processed.
        failed = sum(
            1 for file_id in file_ids
            if not importer.download_and_analyze_file(
                file_id, address_pipeline, body.short_name))

        if failed > 0:
            logger.error("{} files failed to download".format(failed))