def __init__(self, api_config: Union[API, int, str], skip_existing_files: bool = False,
             overwrite_on_extract=True, show_progress: bool = False,
             concurrency: str = 'sync', on_disk_check='full'):
    """
    Set up an extractor bound to an API configuration.

    :param api_config: the API to extract for, given either as an
        :class:`API` instance, its database id (int), or its name (str).
    :param skip_existing_files: if True, only fetch files not already present.
    :param overwrite_on_extract: accepted for backward compatibility; not stored.
    :param show_progress: if True, display per-file download progress.
    :param concurrency: one of 'sync', 'async' or 'multiprocess'.
    :param on_disk_check: how existence is judged downstream: 'full'
        (check the path) or 'hash' (trust a stored hash in the db).
    :raises TypeError: if api_config is not an API, int or str.
    """
    # isinstance is preferred over `type(...) is ...`; check API first so
    # that API subclasses are accepted directly.
    if isinstance(api_config, API):
        self.api = api_config
    elif isinstance(api_config, int):
        self.api = get_session().query(API).filter(
            API.id == api_config).one()
    elif isinstance(api_config, str):
        self.api = get_session().query(API).filter(
            API.name == api_config).one()
    else:
        # previously an unsupported type silently left self.api unset and
        # failed later with an opaque AttributeError; fail fast instead
        raise TypeError(
            f'api_config must be an API, int or str, '
            f'not {type(api_config).__name__}')
    self.headers = {}
    self.auth = None
    self.auth_token = None
    self.skip_existing_files = skip_existing_files
    self.show_progress = show_progress
    self.concurrency = concurrency
    self.on_disk_check = on_disk_check
    if self.api.creds:
        details = self.api.creds.creds_details
        cookie = details.get('cookie', None)
        if cookie:
            self.headers['Cookie'] = cookie['name'] + '=' + cookie['value']
        self.auth = details.get('auth', None)
        self.auth_token = details.get('auth_token', None)
        if self.auth_token:
            self.headers[
                self.auth_token['header']] = self.auth_token['token']
def cached_files_on_disk(self, use_hash=True, missing=False, limit_ids=None) -> Query:
    """
    Retrieve a query of all cached files thought to (or known to) be on disk.

    :param use_hash: if true, use the fact that the file has a hash in the
        db as evidence of existence. if false, actually checks whether the
        file is present at its path.
    :param missing: return any files that are configured but missing from disk.
    :param limit_ids: limit the result set to the specific ids supplied.
    :return: a batched query
    """
    # actually checking existence can get pretty expensive for the
    # scenario where you have lots of files on a distributed file system
    # so if we're pretty sure we haven't deleted the files manually, we
    # can trust that if a hash is present in the db, then the file exists
    files = get_session().query(CachedFile).join(
        Source, CachedFile.source_id == Source.id).filter(
        Source.api_config_id == self.id)
    if use_hash:
        if missing:
            files = files.filter(CachedFile.hash.is_(None))
        else:
            files = files.filter(CachedFile.hash.isnot(None))
        if limit_ids:
            files = files.filter(CachedFile.id.in_(limit_ids))
        return files.yield_per(self.BATCH_SIZE)
    else:
        q: Query = get_session().query(
            Source.data_dir, CachedFile.path, CachedFile.id).join(
            Source, CachedFile.source_id == Source.id).filter(
            Source.api_config_id == self.id)
        # BUG FIX: these sets were previously *reassigned* on every batch,
        # so only the final batch's existence checks survived the loop.
        # Accumulate across all batches instead.
        all_ids = set()
        present_ids = set()
        for batch in chunked(q.yield_per(self.BATCH_SIZE), self.BATCH_SIZE):
            all_ids |= {item[2] for item in batch}
            present_ids |= {
                item[2] for item in batch
                if (Path(item[0]) / item[1]).resolve().exists()
            }
        # decide present vs missing over the full configured set, then
        # apply the optional id restriction to the final answer
        file_ids = (all_ids - present_ids) if missing else present_ids
        if limit_ids:
            file_ids &= set(limit_ids)
        return files.filter(CachedFile.id.in_(file_ids)).yield_per(
            self.BATCH_SIZE)
def _generate_expected_files(self, extract_dir: Path, archived_paths: Set[Path],
                             expected_paths: Set[Path]) -> Set[Path]:
    """
    Create :class:`ExpectedFile` rows for any archived paths that do not
    already have an expected file under the extraction directory.

    :param extract_dir: The directory to which the archive is to be extracted.
    :param archived_paths: The list of paths contained in the archive.
    :param expected_paths: The list of expected files.
    :return: A set of paths for which expected files have been generated
    """
    session = get_session()
    missing_paths = set()
    new_rows = []
    for rel_path in archived_paths:
        full_path = extract_dir / rel_path
        if full_path not in expected_paths:
            missing_paths.add(rel_path)
            new_rows.append(ExpectedFile(path=str(full_path), cached_file=self))
    # bulk save avoids one INSERT round-trip per row
    session.bulk_save_objects(new_rows)
    session.commit()
    return missing_paths
def cached_files(self) -> Query:
    """
    Retrieve all :class:`CachedFile` rows configured for this api and
    stored in the database.

    :return: a batched query object.
    """
    session = get_session()
    query = (session.query(CachedFile)
             .join(Source, CachedFile.source_id == Source.id)
             .filter(Source.api_config_id == self.id))
    return query.yield_per(self.BATCH_SIZE)
def expected_files(self) -> Query:
    """
    Retrieve all the expected files under this API.

    :return: A batched query of expected files.
    """
    session = get_session()
    query = (session.query(ExpectedFile)
             .join(CachedFile, ExpectedFile.cached_file_id == CachedFile.id)
             .join(Source, CachedFile.source_id == Source.id)
             .filter(Source.api_config_id == self.id))
    return query.yield_per(self.BATCH_SIZE)
def _update_file_cache(source_file: CachedFile, target_file: Path):
    """
    Deprecated. Refresh the hash, download timestamp and size of
    `source_file` from the file at `target_file` and persist the row.

    :param source_file: the cached-file row to update.
    :param target_file: the on-disk file backing the row.
    :return: None
    """
    session = get_session()
    digest = file_hash(target_file).hexdigest()
    source_file.hash = digest
    source_file.last_download = datetime.now()
    source_file.size = target_file.stat().st_size
    session.add(source_file)
    session.commit()
def get_or_create(model, session=None, **kwargs):
    """
    Fetch the single `model` row matching `kwargs`, creating it if absent.

    Tolerates a concurrent insert: if another transaction wins the race,
    the IntegrityError is rolled back and the winner's row is returned.

    :param model: the mapped class to query.
    :param session: optional session; defaults to the shared session.
    :return: (instance, created) — `created` is True only when this call
        inserted the row.
    """
    db_session = session or get_session()
    try:
        found = db_session.query(model).filter_by(**kwargs).one()
    except NoResultFound:
        pass
    else:
        return found, False
    instance = model(**kwargs)
    try:
        db_session.add(instance)
        db_session.commit()
    except IntegrityError:
        # another writer created the row first; adopt theirs
        db_session.rollback()
        return db_session.query(model).filter_by(**kwargs).one(), False
    return instance, True
def setup(self):
    """
    All subclasses of API must implement the setup method to generate the
    actual configuration that will specify what is to be downloaded.

    This base implementation registers the API by name: it inserts a new
    row when none exists, otherwise adopts the id of the existing row.

    :return:
    """
    if not self.name:
        self.name = type(self).__name__
    session = get_session()
    existing = session.query(API).filter(
        API.name == self.name).one_or_none()
    if existing is not None:
        # already registered under this name; reuse its primary key
        self.id = existing.id
    else:
        session.add(self)
        session.commit()
def session(engine):
    """
    Test fixture: return the shared ketl database session. The `engine`
    argument is a fixture dependency and is not used directly here.
    """
    from ketl.db.settings import get_session as _get_session
    return _get_session()
def extract(self) -> List[Path]:
    """
    Run the extractor. Attempts to minimize the amount of repeated work
    by checking which cached files actually exist, whether on disk or in
    the database, and batching downloads. Optionally distributes the work
    across processes if the `concurrency` parameter is set to
    `multiprocess`.

    :return: a list of paths corresponding to all the
        :class:`ExpectedFile` s that the extractor's API is responsible for.
    """
    session = get_session()

    # depending on whether we are skipping files known to be on disk
    # we produce an iterable that is either a list of queries that will
    # give us the files that are missing, or a chunked version of a query
    if self.skip_existing_files:
        kwargs = {
            'missing': True,
            'use_hash': self.on_disk_check == 'hash'
        }
        data_iterator: Query = self.api.cached_files_on_disk(**kwargs)
    else:
        data_iterator: Query = self.api.cached_files

    # the meta column is not needed for downloading, so defer loading it
    data_iterator = data_iterator.options(defer(CachedFile.meta))

    collected_results = []

    for batch in tqdm(chunked(data_iterator, 10000)):  # type: List[CachedFile]
        if self.concurrency == 'sync':
            # download one file at a time; falsy results (no-ops) are
            # dropped — the survivors are mappings consumed by
            # bulk_update_mappings below
            results = list(
                filter(None, [
                    self.get_file(cached_file.id, cached_file.full_url,
                                  cached_file.full_path,
                                  cached_file.refresh_interval,
                                  cached_file.url_params,
                                  show_progress=self.show_progress)
                    for cached_file in batch
                ]))
            collected_results.extend(results)
        elif self.concurrency == 'async':  # pragma: no cover
            raise NotImplementedError(
                'Async downloads not yet implemented.')  # pragma: no cover
        elif self.concurrency == 'multiprocess':
            # fan the downloads out over a process pool; arguments are
            # passed positionally so they can be pickled for starmap
            get_file_args = [
                (cached_file.id, cached_file.full_url, cached_file.full_path,
                 cached_file.refresh_interval, cached_file.url_params,
                 self.show_progress) for cached_file in batch
            ]
            if get_file_args:
                with Pool() as pool:
                    futures = pool.starmap_async(self.get_file, get_file_args)
                    results = futures.get()
                    if results:
                        results = list(filter(None, results))
                        collected_results.extend(results)
                    # NOTE(review): join() is normally only valid after
                    # close()/terminate(); as written this may raise
                    # ValueError — confirm intended pool shutdown here
                    pool.join()

    # persist the accumulated per-file download results in one bulk update
    session.bulk_update_mappings(CachedFile, collected_results)
    session.commit()

    new_expected_files: List[dict] = []
    updated_expected_files: List[dict] = []

    # build a lookup of (path, cached_file_id) -> ExpectedFile.id for all
    # expected files already recorded under this API, so each preprocess
    # result can be routed to either a bulk insert or a bulk update
    q: Query = session.query(
        ExpectedFile.path, ExpectedFile.cached_file_id, ExpectedFile.id).join(
        CachedFile, ExpectedFile.cached_file_id == CachedFile.id).join(
        Source, CachedFile.source_id == Source.id).filter(
        Source.api_config_id == self.api.id)

    current_files = {(ef[0], ef[1]): ef[2] for ef in q.yield_per(10000)}

    for source_file in data_iterator:
        ef = source_file.preprocess()
        if ef:
            key = (ef['path'], ef['cached_file_id'])
            if key not in current_files:
                new_expected_files.append(ef)
            else:
                # existing row: carry its id so the bulk update can match it
                updated_expected_files.append({
                    'id': current_files[key],
                    **ef
                })

    session.bulk_insert_mappings(ExpectedFile, new_expected_files)
    session.bulk_update_mappings(ExpectedFile, updated_expected_files)
    session.commit()

    return [Path(ef.path) for ef in self.api.expected_files]
class Meta:
    # factory configuration: build instances of the API model and persist
    # each one through the shared ketl session, committing after create
    model = models.API
    sqlalchemy_session = get_session()
    sqlalchemy_session_persistence = 'commit'
class Meta:
    # factory configuration: build instances of the ExpectedFile model and
    # persist each one through the shared ketl session, committing after create
    model = models.ExpectedFile
    sqlalchemy_session = get_session()
    sqlalchemy_session_persistence = 'commit'