Example #1
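These tests reference module-level names the excerpt does not show: the rebuild helpers, a Dask client, a logger, and the S3_CANONICAL_BUCKET constant. A minimal sketch of that shared setup; the import paths and the bucket value are assumptions, not confirmed by the excerpt:

import logging

import pkg_resources
from dask.distributed import Client

# Assumed import paths; the excerpt only shows the call sites.
from impresso_commons.path.path_s3 import impresso_iter_bucket
from impresso_commons.text.rebuilder import compress, rebuild_issues
from impresso_commons.utils.s3 import read_s3_issues

S3_CANONICAL_BUCKET = "s3://original-canonical-data"  # hypothetical value

logger = logging.getLogger(__name__)
client = Client(n_workers=2, threads_per_worker=2)  # local Dask cluster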
def test_rebuild_for_passim():
    input_bucket_name = S3_CANONICAL_BUCKET
    outp_dir = pkg_resources.resource_filename('impresso_commons',
                                               'data/rebuilt-passim')

    # Read the canonical issues of 'luxwort' for 1848 from the input bucket.
    input_issues = read_s3_issues("luxwort", "1848", input_bucket_name)

    # Rebuild the first 50 issues into passim's input format,
    # keeping French-language content only.
    issue_key, json_files = rebuild_issues(issues=input_issues[:50],
                                           input_bucket=input_bucket_name,
                                           output_dir=outp_dir,
                                           dask_client=client,
                                           format='passim',
                                           filter_language=['fr'])
    logger.info(f'{issue_key}: {json_files}')
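rebuild_issues returns the key of the rebuilt batch together with the paths of the JSON-lines files it wrote. A quick way to eyeball one rebuilt document, assuming the output files are bzip2-compressed JSON lines (peek_first_document is a hypothetical helper):

import bz2
import json

def peek_first_document(path: str) -> dict:
    # Hypothetical helper: decode the first JSON-lines record of a rebuilt file.
    with bz2.open(path, "rt", encoding="utf-8") as f:
        return json.loads(f.readline())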
Example #2
def test_rebuild_solr(newspaper_id: str, year: int, limit: int):
    input_bucket_name = S3_CANONICAL_BUCKET
    outp_dir = pkg_resources.resource_filename('impresso_commons',
                                               'data/rebuilt')

    input_issues = read_s3_issues(newspaper_id, year, input_bucket_name)
    print(f'{newspaper_id}/{year}: {len(input_issues)} issues to rebuild')
    print(f'Limiting test rebuild to the first {limit} issues.')

    # Rebuild only the first `limit` issues into the Solr format.
    issue_key, json_files = rebuild_issues(issues=input_issues[:limit],
                                           input_bucket=input_bucket_name,
                                           output_dir=outp_dir,
                                           dask_client=client,
                                           format='solr')

    # Bundle the rebuilt JSON-lines files into a single compressed archive.
    result = compress(issue_key, json_files, outp_dir)
    logger.info(result)
    assert result is not None
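Unlike the other tests, test_rebuild_solr takes arguments, so it needs to be parametrized (or driven by fixtures) to run under pytest. A minimal sketch with purely illustrative values:

import pytest

# Illustrative values only; the decorator goes directly above
# test_rebuild_solr as defined above.
@pytest.mark.parametrize("newspaper_id, year, limit", [("luxwort", 1848, 10)])
def test_rebuild_solr(newspaper_id: str, year: int, limit: int):
    ...  # body as above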
def test_rebuild_JDG2():
    input_bucket_name = "s3://original-canonical-fixed"
    outp_dir = pkg_resources.resource_filename(
        'impresso_commons',
        'data/rebuilt'
    )

    # Rebuild all JDG issues of 1862 (no limit) into the Solr format.
    input_issues = read_s3_issues("JDG", "1862", input_bucket_name)
    print(f'{len(input_issues)} issues to rebuild')

    issue_key, json_files = rebuild_issues(
        issues=input_issues,
        input_bucket=input_bucket_name,
        output_dir=outp_dir,
        dask_client=client,
        format='solr'
    )

    result = compress(issue_key, json_files, outp_dir)
    logger.info(result)
    assert result is not None
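Note that the bucket names are inconsistent across these tests: here and in test_rebuild_indeplux they carry the s3:// scheme, while test_rebuild_NZZ passes a bare name. If the helpers require one form, a trivial normalizer would do (hypothetical; the excerpt does not show which form is expected):

def strip_s3_scheme(bucket_name: str) -> str:
    # Hypothetical normalizer: drop a leading 's3://' scheme if present.
    prefix = "s3://"
    return bucket_name[len(prefix):] if bucket_name.startswith(prefix) else bucket_name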
def test_rebuild_indeplux():
    input_bucket_name = "s3://TRANSFER"
    outp_dir = pkg_resources.resource_filename(
        'impresso_commons',
        'data/rebuilt'
    )

    input_issues = read_s3_issues("indeplux", "1905", input_bucket_name)
    print(f'{len(input_issues)} issues to rebuild')

    # Rebuild the first 50 issues into the Solr format,
    # keeping French-language content only.
    issue_key, json_files = rebuild_issues(
        issues=input_issues[:50],
        input_bucket=input_bucket_name,
        output_dir=outp_dir,
        dask_client=client,
        format='solr',
        filter_language=['fr']
    )

    result = compress(issue_key, json_files, outp_dir)
    logger.info(result)
    assert result is not None
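filter_language=['fr'] restricts the rebuild to French items. A way to verify that on the output, assuming the rebuilt schema exposes the language in a top-level 'lg' field (an assumption; the field name is not shown in the excerpt):

import bz2
import json

def assert_all_french(path: str) -> None:
    # Assumes a top-level 'lg' language field in each rebuilt document.
    with bz2.open(path, "rt", encoding="utf-8") as f:
        for line in f:
            assert json.loads(line).get("lg") == "fr"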
def test_rebuild_NZZ():
    input_bucket_name = "original-canonical-data"
    outp_dir = pkg_resources.resource_filename(
        'impresso_commons',
        'data/rebuilt'
    )

    # Select issues by S3 key prefix instead of read_s3_issues:
    # "NZZ/1784/12/" restricts the run to NZZ issues of December 1784.
    input_issues = impresso_iter_bucket(
        input_bucket_name,
        prefix="NZZ/1784/12/",
        item_type="issue"
    )

    issue_key, json_files = rebuild_issues(
        issues=input_issues,
        input_bucket=input_bucket_name,
        output_dir=outp_dir,
        dask_client=client,
        format='solr'
    )

    result = compress(issue_key, json_files, outp_dir)
    logger.info(result)
    assert result is not None
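Because impresso_iter_bucket matches on the S3 key prefix, widening or narrowing the selection is just a matter of the prefix string. An illustrative variant, assuming the same newspaper/year/month key layout the test itself uses:

from impresso_commons.path.path_s3 import impresso_iter_bucket  # assumed import path

input_bucket_name = "original-canonical-data"
# A shorter prefix widens the selection to all NZZ issues of 1784.
whole_year = impresso_iter_bucket(input_bucket_name, prefix="NZZ/1784/",
                                  item_type="issue")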