Example #1
def test_rebuild_for_passim():
    input_bucket_name = S3_CANONICAL_BUCKET
    outp_dir = pkg_resources.resource_filename('impresso_commons',
                                               'data/rebuilt-passim')

    input_issues = read_s3_issues("luxwort", "1848", input_bucket_name)

    issue_key, json_files = rebuild_issues(issues=input_issues[:50],
                                           input_bucket=input_bucket_name,
                                           output_dir=outp_dir,
                                           dask_client=client,
                                           format='passim',
                                           filter_language=['fr'])
    logger.info(f'{issue_key}: {json_files}')
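The test functions in these examples refer to module-level names (client, logger, S3_CANONICAL_BUCKET) and to helpers imported elsewhere in the test module. A minimal sketch of that setup, reusing the local-cluster settings from the main() examples further down; the import locations and the bucket value are assumptions, not taken from the source:

import logging
import pkg_resources
from dask.distributed import Client

# Assumed import locations for the helpers used throughout these tests.
from impresso_commons.text.rebuilder import rebuild_issues, compress
from impresso_commons.utils.s3 import read_s3_issues

logger = logging.getLogger(__name__)

# Local dask client, mirroring the settings used in main() below.
client = Client(processes=False, n_workers=8, threads_per_worker=1)

# Borrowed from test_rebuild_JDG2 below; the real constant may point elsewhere.
S3_CANONICAL_BUCKET = "s3://original-canonical-fixed"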
Example #2
def test_rebuild_solr(newspaper_id: str, year: int, limit: int):
    input_bucket_name = S3_CANONICAL_BUCKET
    outp_dir = pkg_resources.resource_filename('impresso_commons',
                                               'data/rebuilt')

    input_issues = read_s3_issues(newspaper_id, year, input_bucket_name)
    print(f'{newspaper_id}/{year}: {len(input_issues)} issues to rebuild')
    print(f'limiting test rebuild to first {limit} issues.')

    issue_key, json_files = rebuild_issues(issues=input_issues[:limit],
                                           input_bucket=input_bucket_name,
                                           output_dir=outp_dir,
                                           dask_client=client,
                                           format='solr')

    result = compress(issue_key, json_files, outp_dir)
    logger.info(result)
    assert result is not None
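Because test_rebuild_solr takes arguments, a test runner cannot call it without parametrization. A minimal sketch assuming pytest; the decorator placement and the sample tuple are illustrative, not taken from the source:

import pytest

# Would sit directly above the def test_rebuild_solr(...) shown above.
# The sample values reuse a newspaper/year pair from the other examples.
@pytest.mark.parametrize("newspaper_id, year, limit", [("luxwort", 1848, 50)])
def test_rebuild_solr(newspaper_id: str, year: int, limit: int):
    ...  # body as above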
Example #3
def test_rebuild_JDG2():
    input_bucket_name = "s3://original-canonical-fixed"
    outp_dir = pkg_resources.resource_filename(
        'impresso_commons',
        'data/rebuilt'
    )
    
    input_issues = read_s3_issues("JDG", "1862", input_bucket_name)
    print(f'{len(input_issues)} issues to rebuild')

    issue_key, json_files = rebuild_issues(
        issues=input_issues,
        input_bucket=input_bucket_name,
        output_dir=outp_dir,
        dask_client=client,
        format='solr'
    )
    
    result = compress(issue_key, json_files, outp_dir)
    logger.info(result)
    assert result is not None
Example #4
def test_rebuild_indeplux():
    input_bucket_name = "s3://TRANSFER"
    outp_dir = pkg_resources.resource_filename(
        'impresso_commons',
        'data/rebuilt'
    )

    input_issues = read_s3_issues("indeplux", "1905", input_bucket_name)
    print(f'{len(input_issues)} issues to rebuild')

    issue_key, json_files = rebuild_issues(
        issues=input_issues[:50],
        input_bucket=input_bucket_name,
        output_dir=outp_dir,
        dask_client=client,
        format='solr',
        filter_language=['fr']
    )

    result = compress(issue_key, json_files, outp_dir)
    logger.info(result)
    assert result is not None
Example #5
def main():
    def signal_handler(*args):
        # Handle any cleanup here
        print('SIGINT or CTRL-C detected. Exiting gracefully'
              ' and shutting down the dask kubernetes cluster')
        if cluster:
            cluster.close()
        exit(0)

    arguments = docopt(__doc__)
    clear_output = arguments["--clear"]
    bucket_name = f's3://{arguments["--input-bucket"]}'
    output_bucket_name = arguments["--output-bucket"]
    outp_dir = arguments["--output-dir"]
    filter_config_file = arguments["--filter-config"]
    output_format = arguments["--format"]
    scheduler = arguments["--scheduler"]
    log_file = arguments["--log-file"]
    launch_kubernetes = arguments["--k8"]
    log_level = logging.DEBUG if arguments["--verbose"] else logging.INFO
    languages = arguments["--languages"]

    # Initialise cluster before registering the handler so the closure never
    # references an unbound name if SIGINT arrives before the cluster exists.
    cluster = None
    signal.signal(signal.SIGINT, signal_handler)

    if languages:
        languages = languages.split(',')

    init_logging(log_level, log_file)

    # clean output directory if existing
    if outp_dir is not None and os.path.exists(outp_dir):
        if clear_output is not None and clear_output:
            shutil.rmtree(outp_dir)
            os.mkdir(outp_dir)

    with open(filter_config_file, 'r') as file:
        config = json.load(file)

    # start the dask local cluster
    if scheduler is None:
        if launch_kubernetes:
            cluster = DaskCluster(
                namespace="dhlab",
                cluster_id="impresso-pycommons-k8-rebuild",
                scheduler_pod_spec=make_scheduler_configuration(),
                worker_pod_spec=make_worker_configuration(
                    docker_image="ic-registry.epfl.ch/dhlab/impresso_pycommons:v1",
                    memory="5G"))
            try:
                cluster.create()
                cluster.scale(50, blocking=True)
                client = cluster.make_dask_client()
                print(client.get_versions(check=False))
            except Exception as e:
                print(e)
                cluster.close()
                exit(0)

            print(client)
        else:
            cluster = None
            client = Client(processes=False, n_workers=8, threads_per_worker=1)
    else:
        cluster = None
        client = Client(scheduler)
    logger.info(f"Dask cluster: {client}")

    if arguments["rebuild_articles"]:

        try:
            for n, batch in enumerate(config):
                rebuilt_issues = []
                print(f'Processing batch {n + 1}/{len(config)} [{batch}]')
                newspaper = list(batch.keys())[0]
                start_year, end_year = batch[newspaper]

                for year in range(start_year, end_year):
                    print(f'Processing year {year}')
                    print('Retrieving issues...')
                    try:
                        input_issues = read_s3_issues(newspaper, year,
                                                      bucket_name)
                    except FileNotFoundError:
                        print(f'{newspaper}-{year} not found in {bucket_name}')
                        continue

                    issue_key, json_files = rebuild_issues(
                        issues=input_issues,
                        input_bucket=bucket_name,
                        output_dir=outp_dir,
                        dask_client=client,
                        format=output_format,
                        filter_language=languages)
                    rebuilt_issues.append((issue_key, json_files))

                print((f"Uploading {len(rebuilt_issues)} rebuilt bz2files "
                       f"to {output_bucket_name}"))
                b = db.from_sequence(rebuilt_issues) \
                    .starmap(compress, output_dir=outp_dir) \
                    .starmap(upload, bucket_name=output_bucket_name) \
                    .starmap(cleanup)
                future = b.persist()
                progress(future)

        except Exception as e:
            traceback.print_tb(e.__traceback__)
            print(e)
            if cluster:
                cluster.close()
        finally:
            if cluster:
                cluster.close()

    elif arguments["rebuild_pages"]:
        print("\nFunction not yet implemented (sorry!).\n")
Example #6
def main():

    arguments = docopt(__doc__)
    clear_output = arguments["--clear"]
    bucket_name = f's3://{arguments["--input-bucket"]}'
    output_bucket_name = arguments["--output-bucket"]
    outp_dir = arguments["--output-dir"]
    filter_config_file = arguments["--filter-config"]
    output_format = arguments["--format"]
    scheduler = arguments["--scheduler"]
    log_file = arguments["--log-file"]
    log_level = logging.DEBUG if arguments["--verbose"] else logging.INFO
    languages = arguments["--languages"]

    if languages:
        languages = languages.split(',')

    init_logging(log_level, log_file)

    # clean output directory if existing
    if outp_dir is not None and os.path.exists(outp_dir):
        if clear_output is not None and clear_output:
            shutil.rmtree(outp_dir)
            os.mkdir(outp_dir)

    with open(filter_config_file, 'r') as file:
        config = json.load(file)

    # start the dask local cluster
    if scheduler is None:
        client = Client(processes=False, n_workers=8, threads_per_worker=1)
    else:
        client = Client(scheduler)
    logger.info(f"Dask cluster: {client}")

    if arguments["rebuild_articles"]:

        rebuilt_issues = []

        for n, batch in enumerate(config):

            print(f'Processing batch {n + 1}/{len(config)} [{batch}]')
            newspaper = list(batch.keys())[0]
            start_year, end_year = batch[newspaper]

            for year in range(start_year, end_year):
                print(f'Processing year {year}')
                print('Retrieving issues...')
                try:
                    input_issues = read_s3_issues(
                        newspaper,
                        year,
                        bucket_name
                    )
                except FileNotFoundError:
                    print(f'{newspaper}-{year} not found in {bucket_name}')
                    continue

                issue_key, json_files = rebuild_issues(
                    issues=input_issues,
                    input_bucket=bucket_name,
                    output_dir=outp_dir,
                    dask_client=client,
                    format=output_format,
                    filter_language=languages
                )
                rebuilt_issues.append((issue_key, json_files))

        b = db.from_sequence(rebuilt_issues) \
            .starmap(compress, output_dir=outp_dir) \
            .starmap(upload, bucket_name=output_bucket_name) \
            .starmap(cleanup)
        future = b.persist()
        progress(future)

    elif arguments["rebuild_pages"]:
        print("\nFunction not yet implemented (sorry!).\n")