Example #1
0
def oup(**kwargs):
    """
    Harvest OUP packages found in a local source folder.

    One OUP *package* is delivered as four zip archives sharing a common
    name stem:
      1) pdfa full text ('_archival.zip' postfix)
      2) images ('.img.zip' postfix)
      3) pdf full text ('.pdf.zip' postfix)
      4) xml metadata ('.xml.zip' postfix)
    Each archive may carry files for more than one article.

    Ordering matters: non-xml archives are merely unzipped into place,
    while parsing the xml archive schedules a workflow that expects those
    files to already exist. Alphabetical order of the raw filenames does
    not guarantee this, so files are first grouped by their name stem
    (postfix stripped) and the xml archive is scheduled last per group.

    Groups themselves are processed in alphabetical order; deliveries
    should therefore encode date/time at the start of the filename so the
    alphabetical and chronological orders coincide and the newest version
    of an article wins.

    A crawl is only submitted once the previous one finished (workflows
    were created from the Scrapy output); otherwise Scrapy could delete
    output files before they are processed.

    Expected layout and invocation, e.g.:
        harvest/
            - 20190331195346-ptep_iss_2019_3.img.zip
            - 20190331195346-ptep_iss_2019_3.pdf.zip
            - 20190331195346-ptep_iss_2019_3.xml.zip
            - 20190331195346-ptep_iss_2019_3_archival.zip
        `scoap3 harvest oup --source_folder harvest`

    Legacy naming ('2019-03-30_16:30:41_ptep_iss_2019_3.*') is handled
    the same way.
    """

    prefix = 'file://'
    folder = kwargs.pop('source_folder')
    max_wait = kwargs.pop('max_wait') or current_app.config.get('CLI_HARVEST_MAX_WAIT_TIME', 60)

    # Validate the source folder and collect candidate files.
    source = abspath(folder)
    if isdir(source):
        found = get_files(source)
    else:
        log('Source folder does not exist', logging.ERROR)
        found = []

    if not found:
        log('No packages, abort.', logging.ERROR)
        return

    # Group files belonging to the same package under one key.
    log('grouping files...')
    groups = {}
    for path in found:
        if not path.endswith('.zip'):
            log('package should be a .zip file. Skipping.', package_path=path)
            continue

        key = clean_oup_package_name(path)
        entry = groups.setdefault(key, {'files': []})

        # The xml archive is kept separate so it can be scheduled last.
        if path.endswith(('.xml.zip', '.xml_v1.zip')):
            entry['xml'] = path
        else:
            entry['files'].append(path)

    log('sorting grouped files...')

    # Schedule crawls group by group, alphabetically, xml archive last.
    for key, entry in sorted(groups.items()):
        if 'xml' not in entry:
            log('No xml file, skipping package.', logging.ERROR, package=entry, group_key=key)
            continue

        for path in entry.get('files', []) + [entry['xml']]:
            log('scheduling...', package_path=path)
            if not retry_schedule_and_wait_crawl(max_wait, 'OUP', package_path=prefix + path, **kwargs):
                log('package failed.', logging.ERROR, path=path)
Example #2
0
def test_not_registered():
    # An unrecognised extension is left alone: only the directory part is stripped.
    assert clean_oup_package_name(
        '/some/path/2019-03-30_16:30:41_ptep_iss_2019_3.magical.extension'
    ) == '2019-03-30_16:30:41_ptep_iss_2019_3.magical.extension'
Example #3
0
def test_abs_path2():
    # Both the leading directories and the '_archival.zip' postfix are removed.
    expected = '2019-03-30_16:30:41_ptep_iss_2019_3'
    result = clean_oup_package_name('/harvest/oup/2019-03-30_16:30:41_ptep_iss_2019_3_archival.zip')
    assert result == expected
Example #4
0
def test_none():
    # A missing path degrades to an empty group key.
    result = clean_oup_package_name(None)
    assert result == ''
Example #5
0
def test_empty():
    # An empty path yields an empty group key.
    result = clean_oup_package_name('')
    assert result == ''
Example #6
0
def test_only_file():
    # A bare filename (no directory) loses only its '.xml.zip' postfix.
    filename = '2019-03-30_16:30:41_ptep_iss_2019_3.xml.zip'
    assert clean_oup_package_name(filename) == '2019-03-30_16:30:41_ptep_iss_2019_3'
Example #7
0
def test_rel_path3():
    # A relative directory prefix is stripped along with the '.xml.zip' postfix.
    result = clean_oup_package_name('oup/2019-03-30_16:30:41_ptep_iss_2019_3.xml.zip')
    assert result == '2019-03-30_16:30:41_ptep_iss_2019_3'
Example #8
0
def oup(**kwargs):
    """
    Harvest OUP packages from a local folder.

    An OUP *package* arrives as four zip archives with a shared name stem:
      1) pdfa full text ('_archival.zip' postfix)
      2) images ('.img.zip' postfix)
      3) pdf full text ('.pdf.zip' postfix)
      4) xml metadata ('.xml.zip' postfix)
    A single archive may contain data for several articles.

    Processing order is critical: non-xml archives are only unzipped into
    the target directory, whereas parsing the xml archive triggers a
    workflow that expects those files to be in place already. Plain
    alphabetical order of filenames does not guarantee this, so files are
    grouped by their stem (postfix removed) and the xml archive of each
    group is scheduled last.

    Groups are processed alphabetically; put the delivery date/time at
    the start of the filename so alphabetical and chronological order
    agree and the newest article version is the one that remains.

    A new crawl is submitted only after the previous one has completed
    (its workflows were created from the Scrapy output), preventing
    Scrapy from deleting output files before they are consumed.

    Example layout:
        harvest/
            - 20190331195346-ptep_iss_2019_3.img.zip
            - 20190331195346-ptep_iss_2019_3.pdf.zip
            - 20190331195346-ptep_iss_2019_3.xml.zip
            - 20190331195346-ptep_iss_2019_3_archival.zip
    invoked via `scoap3 harvest oup --source_folder harvest`.
    Legacy names ('2019-03-30_16:30:41_ptep_iss_2019_3.*') work the same.
    """

    url_prefix = 'file://'
    src_folder = kwargs.pop('source_folder')
    max_wait = kwargs.pop('max_wait') or current_app.config.get(
        'CLI_HARVEST_MAX_WAIT_TIME', 60)

    # Resolve and validate the source folder, gathering candidate files.
    root = abspath(src_folder)
    candidates = []
    if not isdir(root):
        log('Source folder does not exist', logging.ERROR)
    else:
        candidates = get_files(root)

    if not candidates:
        log('No packages, abort.', logging.ERROR)
        return

    # Bucket files of the same package under one cleaned-name key.
    log('grouping files...')
    buckets = {}
    for item in candidates:
        if not item.endswith('.zip'):
            log('package should be a .zip file. Skipping.',
                package_path=item)
            continue

        bucket = buckets.setdefault(clean_oup_package_name(item), {'files': []})

        # xml archives are held back so they can be scheduled last.
        if item.endswith(('.xml.zip', '.xml_v1.zip')):
            bucket['xml'] = item
        else:
            bucket['files'].append(item)

    log('sorting grouped files...')

    # Walk the groups alphabetically, scheduling the xml archive last.
    for bucket_key, bucket in sorted(buckets.items()):
        if 'xml' not in bucket:
            log('No xml file, skipping package.',
                logging.ERROR,
                package=bucket,
                group_key=bucket_key)
            continue

        for item in bucket.get('files', []) + [bucket['xml']]:
            log('scheduling...', package_path=item)
            ok = retry_schedule_and_wait_crawl(
                max_wait, 'OUP', package_path=url_prefix + item, **kwargs)
            if not ok:
                log('package failed.', logging.ERROR, path=item)