Example #1
    def __init__(self,
                 api_config: Union[API, int, str],
                 skip_existing_files: bool = False,
                 overwrite_on_extract=True,
                 show_progress: bool = False,
                 concurrency: str = 'sync',
                 on_disk_check='full'):

        if isinstance(api_config, int):
            self.api = get_session().query(API).filter(
                API.id == api_config).one()
        elif isinstance(api_config, str):
            self.api = get_session().query(API).filter(
                API.name == api_config).one()
        elif isinstance(api_config, API):
            self.api = api_config
        else:
            raise TypeError(
                f'api_config must be an API instance, an id or a name, '
                f'not {type(api_config).__name__}')
        self.headers = {}
        self.auth = None
        self.auth_token = None
        self.skip_existing_files = skip_existing_files
        self.show_progress = show_progress
        self.concurrency = concurrency
        self.on_disk_check = on_disk_check

        if self.api.creds:
            details = self.api.creds.creds_details
            cookie = details.get('cookie', None)
            if cookie:
                self.headers['Cookie'] = cookie['name'] + '=' + cookie['value']
            self.auth = details.get('auth', None)
            self.auth_token = details.get('auth_token', None)
            if self.auth_token:
                self.headers[
                    self.auth_token['header']] = self.auth_token['token']
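A minimal usage sketch of this constructor, assuming it belongs to an extractor class (called DefaultExtractor here purely for illustration) and that an API named 'my_api' has already been registered in the database:

# a sketch only: DefaultExtractor and 'my_api' are assumed names
extractor = DefaultExtractor('my_api',              # look the API up by name
                             skip_existing_files=True,
                             show_progress=True,
                             concurrency='multiprocess')

# an integer primary key or an already-loaded API instance also works
extractor_by_id = DefaultExtractor(42)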
Example #2
    def cached_files_on_disk(self,
                             use_hash=True,
                             missing=False,
                             limit_ids=None) -> Query:
        """ Retrieve a list of all cached files thought to (or known to) be on disk.

        :param use_hash: if true, use the fact that the file has a hash in the db as evidence of existence.
            if false, actually checks whether the file is present at its path.
        :param missing: return any files that are configured but missing from disk.
        :param limit_ids: limit the result set to the specific ids supplied.
        :return: a batched query
        """

        # actually checking existence can get pretty expensive for the
        # scenario where you have lots of files on a distributed file system
        # so if we're pretty sure we haven't deleted the files manually, we
        # can trust that if a hash is present in the db, then the file exists

        files = get_session().query(CachedFile).join(
            Source, CachedFile.source_id == Source.id).filter(
                Source.api_config_id == self.id)

        if use_hash:
            if missing:
                files = files.filter(CachedFile.hash.is_(None))
            else:
                files = files.filter(CachedFile.hash.isnot(None))

            if limit_ids:
                files = files.filter(CachedFile.id.in_(limit_ids))

            return files.yield_per(self.BATCH_SIZE)

        else:
            q: Query = get_session().query(
                Source.data_dir, CachedFile.path,
                CachedFile.id).join(Source,
                                    CachedFile.source_id == Source.id).filter(
                                        Source.api_config_id == self.id)

            # accumulate across batches: every id seen, and the ids whose
            # files actually exist on disk
            file_ids = set()
            all_ids = set()
            for batch in chunked(q.yield_per(self.BATCH_SIZE),
                                 self.BATCH_SIZE):
                all_ids |= {item[2] for item in batch}
                file_ids |= {
                    item[2]
                    for item in batch
                    if (Path(item[0]) / item[1]).resolve().exists()
                }

            if missing:
                file_ids = all_ids - file_ids

            if limit_ids:
                file_ids = file_ids & set(limit_ids)

            return files.filter(CachedFile.id.in_(file_ids)).yield_per(
                self.BATCH_SIZE)
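A sketch of how the batched query returned above might be consumed; the api instance and the id list are illustrative only:

# trust the stored hash instead of touching the file system
for cached_file in api.cached_files_on_disk(use_hash=True):
    print(cached_file.id, cached_file.path)

# or ask for configured files that appear to be missing, restricted to given ids
missing_ids = {cf.id for cf in api.cached_files_on_disk(use_hash=True,
                                                        missing=True,
                                                        limit_ids=[1, 2, 3])}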
Example #3
    def _generate_expected_files(self, extract_dir: Path,
                                 archived_paths: Set[Path],
                                 expected_paths: Set[Path]) -> Set[Path]:
        """ Generates expected file entries in the table if they do not already exist.

        :param extract_dir: The directory to which the archive is to be extracted.
        :param archived_paths: The set of paths contained in the archive.
        :param expected_paths: The set of paths of expected files already recorded.
        :return: A set of paths for which expected files have been generated.
        """
        session = get_session()

        missing_paths = {
            path
            for path in archived_paths
            if extract_dir / path not in expected_paths
        }
        expected_files = [
            ExpectedFile(path=str(extract_dir / path), cached_file=self)
            for path in missing_paths
        ]
        session.bulk_save_objects(expected_files)
        session.commit()

        return missing_paths
Example #4
    def cached_files(self) -> Query:
        """ Retrieve a list of all :class:`CachedFile` configured for this
        api and stored in the database.

        :return: a batched query object.
        """

        q: Query = get_session().query(CachedFile).join(
            Source, CachedFile.source_id == Source.id).filter(
                Source.api_config_id == self.id)

        return q.yield_per(self.BATCH_SIZE)
Example #5
    def expected_files(self) -> Query:
        """ Retrieve all the expected files under this API.

        :return: A batched query of expected files.
        """

        q = get_session().query(ExpectedFile).join(
            CachedFile, ExpectedFile.cached_file_id == CachedFile.id).join(
                Source, CachedFile.source_id == Source.id).filter(
                    Source.api_config_id == self.id)

        return q.yield_per(self.BATCH_SIZE)
Example #6
    def _update_file_cache(source_file: CachedFile, target_file: Path):
        """ Deprecated.

        :param source_file:
        :param target_file:
        :return:
        """
        session = get_session()
        source_file.hash = file_hash(target_file).hexdigest()
        source_file.last_download = datetime.now()
        source_file.size = target_file.stat().st_size
        session.add(source_file)
        session.commit()
Example #7
def get_or_create(model, session=None, **kwargs):
    """ Fetch a single `model` row matching `kwargs`, creating it if it does not exist.

    :return: a tuple of (instance, created), where `created` is True if a new row was inserted.
    """
    session = session or get_session()
    try:
        return session.query(model).filter_by(**kwargs).one(), False
    except NoResultFound:
        created = model(**kwargs)
        try:
            session.add(created)
            session.commit()
            return created, True
        except IntegrityError:
            # another session inserted the same row concurrently; fetch it instead
            session.rollback()
            return session.query(model).filter_by(**kwargs).one(), False
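For illustration, a small usage sketch of get_or_create against the API model shown in the other examples; the name 'my_api' is hypothetical:

# fetch the API row named 'my_api', inserting it if absent; the second element
# of the returned tuple reports whether a new row was created
api, created = get_or_create(API, name='my_api')
if created:
    print(f'registered new API: {api.name}')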
Example #8
    def setup(self):
        """ All subclasses of API must implement the setup method to generate the actual configuration
        that will specify what is to be downloaded.

        :return:
        """
        if not self.name:
            self.name = self.__class__.__name__

        session = get_session()
        existing_api = session.query(API).filter(
            API.name == self.name).one_or_none()
        if not existing_api:
            session.add(self)
            session.commit()
        else:
            self.id = existing_api.id
Example #9
def session(engine):
    # `engine` is not used directly; requiring it ensures the database engine
    # is configured before a session is handed out
    from ketl.db.settings import get_session

    return get_session()
Example #10
    def extract(self) -> List[Path]:
        """ Run the extractor. Attempts to minimize the amount of repeated work by checking
            which cached files actually exist, whether on disk or in the database, and batching
            downloads. Optionally distributes the work across processes if the `concurrency`
            parameter is set to `multiprocess`.

        :return: a list of paths corresponding to all the :class:`ExpectedFile` s that the
            extractor's API is responsible for.
        """

        session = get_session()

        # depending on whether we are skipping files known to be on disk,
        # build a query that yields either the files missing from disk or
        # every cached file configured for this API
        if self.skip_existing_files:
            kwargs = {
                'missing': True,
                'use_hash': self.on_disk_check == 'hash'
            }
            data_iterator: Query = self.api.cached_files_on_disk(**kwargs)
        else:
            data_iterator: Query = self.api.cached_files

        data_iterator = data_iterator.options(defer(CachedFile.meta))
        collected_results = []

        for batch in tqdm(chunked(data_iterator,
                                  10000)):  # type: List[CachedFile]

            if self.concurrency == 'sync':
                results = list(
                    filter(None, [
                        self.get_file(cached_file.id,
                                      cached_file.full_url,
                                      cached_file.full_path,
                                      cached_file.refresh_interval,
                                      cached_file.url_params,
                                      show_progress=self.show_progress)
                        for cached_file in batch
                    ]))
                collected_results.extend(results)

            elif self.concurrency == 'async':  # pragma: no cover
                raise NotImplementedError(
                    'Async downloads not yet implemented.')  # pragma: no cover
            elif self.concurrency == 'multiprocess':
                get_file_args = [
                    (cached_file.id, cached_file.full_url,
                     cached_file.full_path, cached_file.refresh_interval,
                     cached_file.url_params, self.show_progress)
                    for cached_file in batch
                ]

                if get_file_args:
                    with Pool() as pool:
                        futures = pool.starmap_async(self.get_file,
                                                     get_file_args)
                        results = futures.get()
                        if results:
                            results = list(filter(None, results))
                            collected_results.extend(results)
                    pool.join()

            session.bulk_update_mappings(CachedFile, collected_results)

        session.commit()

        new_expected_files: List[dict] = []
        updated_expected_files: List[dict] = []

        q: Query = session.query(
            ExpectedFile.path,
            ExpectedFile.cached_file_id, ExpectedFile.id).join(
                CachedFile, ExpectedFile.cached_file_id == CachedFile.id).join(
                    Source, CachedFile.source_id == Source.id).filter(
                        Source.api_config_id == self.api.id)

        current_files = {(ef[0], ef[1]): ef[2] for ef in q.yield_per(10000)}

        for source_file in data_iterator:
            ef = source_file.preprocess()
            if ef:
                key = (ef['path'], ef['cached_file_id'])
                if key not in current_files:
                    new_expected_files.append(ef)
                else:
                    updated_expected_files.append({
                        'id': current_files[key],
                        **ef
                    })

        session.bulk_insert_mappings(ExpectedFile, new_expected_files)
        session.bulk_update_mappings(ExpectedFile, updated_expected_files)
        session.commit()

        return [Path(ef.path) for ef in self.api.expected_files]
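A hedged end-to-end sketch tying the constructor from Example #1 to this method; the extractor class name DefaultExtractor is again an assumption:

# construct the extractor for a registered API and download anything missing
extractor = DefaultExtractor('my_api', skip_existing_files=True,
                             concurrency='sync')
expected_paths = extractor.extract()
for path in expected_paths:
    print(path, path.exists())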
Example #11
    class Meta:
        model = models.API
        sqlalchemy_session = get_session()
        sqlalchemy_session_persistence = 'commit'
Example #12
    class Meta:
        model = models.ExpectedFile
        sqlalchemy_session = get_session()
        sqlalchemy_session_persistence = 'commit'