def test_rebuild_for_passim():
    input_bucket_name = S3_CANONICAL_BUCKET
    outp_dir = pkg_resources.resource_filename(
        'impresso_commons',
        'data/rebuilt-passim'
    )

    input_issues = read_s3_issues("luxwort", "1848", input_bucket_name)

    issue_key, json_files = rebuild_issues(
        issues=input_issues[:50],
        input_bucket=input_bucket_name,
        output_dir=outp_dir,
        dask_client=client,
        format='passim',
        filter_language=['fr']
    )
    logger.info(f'{issue_key}: {json_files}')

def test_rebuild_solr(newspaper_id: str, year: int, limit: int):
    input_bucket_name = S3_CANONICAL_BUCKET
    outp_dir = pkg_resources.resource_filename(
        'impresso_commons',
        'data/rebuilt'
    )

    input_issues = read_s3_issues(newspaper_id, year, input_bucket_name)
    print(f'{newspaper_id}/{year}: {len(input_issues)} issues to rebuild')
    print(f'limiting test rebuild to first {limit} issues.')

    issue_key, json_files = rebuild_issues(
        issues=input_issues[:limit],
        input_bucket=input_bucket_name,
        output_dir=outp_dir,
        dask_client=client,
        format='solr'
    )

    result = compress(issue_key, json_files, outp_dir)
    logger.info(result)
    assert result is not None

def test_rebuild_JDG2():
    input_bucket_name = "s3://original-canonical-fixed"
    outp_dir = pkg_resources.resource_filename(
        'impresso_commons',
        'data/rebuilt'
    )

    input_issues = read_s3_issues("JDG", "1862", input_bucket_name)
    print(f'{len(input_issues)} issues to rebuild')

    issue_key, json_files = rebuild_issues(
        issues=input_issues,
        input_bucket=input_bucket_name,
        output_dir=outp_dir,
        dask_client=client,
        format='solr'
    )

    result = compress(issue_key, json_files, outp_dir)
    logger.info(result)
    assert result is not None

def test_rebuild_indeplux():
    input_bucket_name = "s3://TRANSFER"
    outp_dir = pkg_resources.resource_filename(
        'impresso_commons',
        'data/rebuilt'
    )

    input_issues = read_s3_issues("indeplux", "1905", input_bucket_name)
    print(f'{len(input_issues)} issues to rebuild')

    issue_key, json_files = rebuild_issues(
        issues=input_issues[:50],
        input_bucket=input_bucket_name,
        output_dir=outp_dir,
        dask_client=client,
        format='solr',
        filter_language=['fr']
    )

    result = compress(issue_key, json_files, outp_dir)
    logger.info(result)
    assert result is not None

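# Note: the test functions above reference several module-level names that are
# assumed to be set up elsewhere in this module and are not part of this excerpt:
# a Dask `client`, a `logger`, the `S3_CANONICAL_BUCKET` constant, and the
# `pkg_resources`, `read_s3_issues`, `rebuild_issues` and `compress` imports from
# the impresso_commons package. A minimal sketch of the kind of client they
# assume, mirroring the local-cluster branch of main() below:
#
#     from dask.distributed import Client
#     client = Client(processes=False, n_workers=8, threads_per_worker=1)
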
def main():

    def signal_handler(*args):
        # Handle any cleanup here
        print('SIGINT or CTRL-C detected. Exiting gracefully'
              ' and shutting down the dask kubernetes cluster')
        if cluster:
            cluster.close()
        exit(0)

    arguments = docopt(__doc__)
    clear_output = arguments["--clear"]
    bucket_name = f's3://{arguments["--input-bucket"]}'
    output_bucket_name = arguments["--output-bucket"]
    outp_dir = arguments["--output-dir"]
    filter_config_file = arguments["--filter-config"]
    output_format = arguments["--format"]
    scheduler = arguments["--scheduler"]
    log_file = arguments["--log-file"]
    launch_kubernetes = arguments["--k8"]
    log_level = logging.DEBUG if arguments["--verbose"] else logging.INFO
    languages = arguments["--languages"]

    # bind `cluster` before registering the handler, so a SIGINT received
    # before the cluster exists does not raise a NameError inside it
    cluster = None
    signal.signal(signal.SIGINT, signal_handler)

    if languages:
        languages = languages.split(',')

    init_logging(log_level, log_file)

    # clean output directory if existing
    if outp_dir is not None and os.path.exists(outp_dir):
        if clear_output is not None and clear_output:
            shutil.rmtree(outp_dir)
            os.mkdir(outp_dir)

    with open(filter_config_file, 'r') as file:
        config = json.load(file)

    # start the dask local cluster
    if scheduler is None:
        if launch_kubernetes:
            cluster = DaskCluster(
                namespace="dhlab",
                cluster_id="impresso-pycommons-k8-rebuild",
                scheduler_pod_spec=make_scheduler_configuration(),
                worker_pod_spec=make_worker_configuration(
                    docker_image="ic-registry.epfl.ch/dhlab/impresso_pycommons:v1",
                    memory="5G"
                )
            )
            try:
                cluster.create()
                cluster.scale(50, blocking=True)
                client = cluster.make_dask_client()
                print(client.get_versions(check=False))
            except Exception as e:
                print(e)
                cluster.close()
                exit(0)
            print(client)
        else:
            cluster = None
            client = Client(processes=False, n_workers=8, threads_per_worker=1)
    else:
        cluster = None
        client = Client(scheduler)

    logger.info(f"Dask cluster: {client}")

    if arguments["rebuild_articles"]:
        try:
            for n, batch in enumerate(config):
                rebuilt_issues = []
                print(f'Processing batch {n + 1}/{len(config)} [{batch}]')
                newspaper = list(batch.keys())[0]
                start_year, end_year = batch[newspaper]

                # NB: the end year is exclusive (plain `range` semantics)
                for year in range(start_year, end_year):
                    print(f'Processing year {year}')
                    print('Retrieving issues...')
                    try:
                        input_issues = read_s3_issues(newspaper, year, bucket_name)
                    except FileNotFoundError:
                        print(f'{newspaper}-{year} not found in {bucket_name}')
                        continue

                    issue_key, json_files = rebuild_issues(
                        issues=input_issues,
                        input_bucket=bucket_name,
                        output_dir=outp_dir,
                        dask_client=client,
                        format=output_format,
                        filter_language=languages
                    )
                    rebuilt_issues.append((issue_key, json_files))

                print((f"Uploading {len(rebuilt_issues)} rebuilt bz2 files "
                       f"to {output_bucket_name}"))
                b = db.from_sequence(rebuilt_issues) \
                    .starmap(compress, output_dir=outp_dir) \
                    .starmap(upload, bucket_name=output_bucket_name) \
                    .starmap(cleanup)
                future = b.persist()
                progress(future)
        except Exception as e:
            traceback.print_tb(e.__traceback__)
            print(e)
            if cluster:
                cluster.close()
        finally:
            if cluster:
                cluster.close()

    elif arguments["rebuild_pages"]:
        print("\nFunction not yet implemented (sorry!).\n")

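# A hedged invocation sketch for the Kubernetes-enabled entry point above. The
# docopt usage string (__doc__) is not part of this excerpt, so the exact command
# layout and the script name are assumptions; the option names are the ones read
# from `arguments` in main(), and the bucket names and paths are placeholders.
#
#     python rebuilder.py rebuild_articles \
#         --input-bucket=<canonical-bucket> \
#         --output-bucket=<rebuilt-bucket> \
#         --output-dir=/tmp/rebuilt \
#         --filter-config=config.json \
#         --format=solr \
#         --languages=fr,de \
#         --log-file=rebuild.log \
#         --verbose --clear --k8
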
def main():
    arguments = docopt(__doc__)
    clear_output = arguments["--clear"]
    bucket_name = f's3://{arguments["--input-bucket"]}'
    output_bucket_name = arguments["--output-bucket"]
    outp_dir = arguments["--output-dir"]
    filter_config_file = arguments["--filter-config"]
    output_format = arguments["--format"]
    scheduler = arguments["--scheduler"]
    log_file = arguments["--log-file"]
    log_level = logging.DEBUG if arguments["--verbose"] else logging.INFO
    languages = arguments["--languages"]

    if languages:
        languages = languages.split(',')

    init_logging(log_level, log_file)

    # clean output directory if existing
    if outp_dir is not None and os.path.exists(outp_dir):
        if clear_output is not None and clear_output:
            shutil.rmtree(outp_dir)
            os.mkdir(outp_dir)

    with open(filter_config_file, 'r') as file:
        config = json.load(file)

    # start the dask local cluster
    if scheduler is None:
        client = Client(processes=False, n_workers=8, threads_per_worker=1)
    else:
        client = Client(scheduler)

    logger.info(f"Dask cluster: {client}")

    if arguments["rebuild_articles"]:
        rebuilt_issues = []

        for n, batch in enumerate(config):
            print(f'Processing batch {n + 1}/{len(config)} [{batch}]')
            newspaper = list(batch.keys())[0]
            start_year, end_year = batch[newspaper]

            for year in range(start_year, end_year):
                print(f'Processing year {year}')
                print('Retrieving issues...')
                try:
                    input_issues = read_s3_issues(
                        newspaper, year, bucket_name
                    )
                except FileNotFoundError:
                    print(f'{newspaper}-{year} not found in {bucket_name}')
                    continue

                issue_key, json_files = rebuild_issues(
                    issues=input_issues,
                    input_bucket=bucket_name,
                    output_dir=outp_dir,
                    dask_client=client,
                    format=output_format,
                    filter_language=languages
                )
                rebuilt_issues.append((issue_key, json_files))

        b = db.from_sequence(rebuilt_issues) \
            .starmap(compress, output_dir=outp_dir) \
            .starmap(upload, bucket_name=output_bucket_name) \
            .starmap(cleanup)
        future = b.persist()
        progress(future)

    elif arguments["rebuild_pages"]:
        print("\nFunction not yet implemented (sorry!).\n")

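# A minimal sketch of the filter-config JSON consumed by both main() variants
# above. Judging from how `config` is iterated (each batch is a dict with a single
# newspaper id mapping to a [start_year, end_year) pair, end year exclusive), a
# file with the following shape should be accepted; the constant name, newspaper
# ids and year ranges below are purely illustrative.
EXAMPLE_FILTER_CONFIG = [
    {"luxwort": [1848, 1850]},   # hypothetical batch: luxwort, years 1848-1849
    {"indeplux": [1905, 1906]},  # hypothetical batch: indeplux, year 1905 only
]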