Example #1
0
def acta_cpc(source_file, source_folder):
    """
    Harvests Acta Physica Polonica B or Chinese Physics C packages.

    Passed files should contain pushed metadata in xml format.
    Exactly one of the source_file and source_folder parameters should be present.

    If a folder is passed, all files within the folder will be parsed and processed. The files will always be processed
    in alphabetical order, so in case an article is available in multiple files, make sure that the alphabetical and
    chronological orders are equivalent. Ignoring this can result in having an older version of an article.
    """

    entries = get_packages_for_file_or_folder(source_file, source_folder)

    if not entries:
        log('No entries, abort.', logging.ERROR)
        return

    # harvesting all packages found in source folder
    # sorting like this will result in a chronological order, because the filename contains
    # the date and time of delivery.
    entries_count = len(entries)
    log('Processing packages...', entry_count=entries_count)
    # Loop variable renamed from `path` to `package_path` so it does not
    # shadow the `path` module used elsewhere in this file (see `path.join`).
    for i, package_path in enumerate(sorted(entries)):
        if not isfile(package_path):
            log('Path not a file. Skipping.', path=package_path)
            continue

        log('processing package', path=package_path, current_index=i, entry_count=entries_count)
        with open(package_path, 'rt') as f:
            file_data = f.read()
            obj = parse_received_package(file_data, package_path)

            create_from_json({'records': [obj]}, apply_async=False)
Example #2
0
def test_delete_halted_workflows():
    """Starting a second workflow for the same DOI must delete the
    previously halted workflow for that record."""
    sample_record = {'dois': [{'value': 'test/doi'}]}

    first_id = create_from_json({'records': [sample_record]}, apply_async=False)
    first_workflow = Workflow.query.get(first_id)
    assert first_workflow.status == WorkflowStatus.HALTED

    second_id = create_from_json({'records': [sample_record]}, apply_async=False)
    second_workflow = Workflow.query.get(second_id)

    assert first_id != second_id
    assert second_workflow.status == WorkflowStatus.HALTED
    # The original halted workflow should no longer exist.
    assert Workflow.query.get(first_id) is None
def run_workflow(input_json_filename):
    """Use input_json_filename to load hepcrawl response and to run article_upload workflow.

    :param input_json_filename: file name of a JSON fixture in the
        hepcrawl response directory.
    :return: the Workflow object created for the loaded record.
    """

    file_path = path.join(get_response_dir(), 'hepcrawl', input_json_filename)
    with open(file_path, 'rt') as f:
        # json.load parses straight from the file object — no need to
        # buffer the whole file into a string first.
        json_data = json.load(f)

    workflow_id = create_from_json({'records': [json_data]},
                                   apply_async=False)[0]
    return Workflow.query.get(workflow_id)
Example #4
0
def acta_cpc(source_file, source_folder):
    """
    Harvests Acta Physica Polonica B or Chinese Physics C packages.

    Passed files should contain pushed metadata in xml format.
    Exactly one of the source_file and source_folder parameters should be present.

    If a folder is passed, all files within the folder will be parsed and processed. The files will always be processed
    in alphabetical order, so in case an article is available in multiple files, make sure that the alphabetical and
    chronological orders are equivalent. Ignoring this can result in having an older version of an article.
    """

    entries = get_packages_for_file_or_folder(source_file, source_folder)

    if not entries:
        log('No entries, abort.', logging.ERROR)
        return

    # harvesting all packages found in source folder
    # sorting like this will result in a chronological order, because the filename contains
    # the date and time of delivery.
    entries_count = len(entries)
    log('Processing packages...', entry_count=entries_count)
    # Loop variable renamed from `path` to `package_path` so it does not
    # shadow the `path` module used elsewhere in this file (see `path.join`).
    for i, package_path in enumerate(sorted(entries)):
        if not isfile(package_path):
            log('Path not a file. Skipping.', path=package_path)
            continue

        log('processing package',
            path=package_path,
            current_index=i,
            entry_count=entries_count)
        with open(package_path, 'rt') as f:
            file_data = f.read()
            obj = parse_received_package(file_data, package_path)

            create_from_json({'records': [obj]}, apply_async=False)
def run_workflow_with_data(input_json_data, mock_address):
    """Runs article_upload workflow with input_json_data.
    Returns the Workflow object."""

    halt_patcher = patch(
        'scoap3.modules.workflows.workflows.articles_upload.__halt_and_notify',
        mock_halt,
    )
    with halt_patcher:
        # Serve the JSON schemas the workflow validates records against.
        mock_address.register_uri('GET', '/schemas/hep.json', content=read_hep_schema())
        mock_address.register_uri('GET', '/schemas/elements/titles.json', content=read_titles_schema())
        # Allow indexer traffic to reach the real backend.
        mock_address.register_uri(
            requests_mock.ANY,
            re.compile('.*(indexer).*'),
            real_http=True,
        )
        workflow_id = create_from_json({'records': [input_json_data]}, apply_async=False)[0]

    return Workflow.query.get(workflow_id)
Example #6
0
def run_article_upload_with_data(input_json_data, mock_address):
    """Runs article_upload workflow with input_json_data.
    Returns the Workflow object."""

    halt_target = 'scoap3.modules.workflows.workflows.articles_upload.__halt_and_notify'
    with patch(halt_target, mock_halt):
        # Serve the JSON schemas the workflow validates records against.
        mock_address.register_uri(
            'GET', '/schemas/hep.json', content=read_hep_schema())
        mock_address.register_uri(
            'GET', '/schemas/elements/titles.json', content=read_titles_schema())
        # Let indexer and localhost traffic through to the real services.
        mock_address.register_uri(
            requests_mock.ANY,
            re.compile('.*(indexer|localhost).*'),
            real_http=True,
        )
        workflow_id = create_from_json(
            {'records': [input_json_data]}, apply_async=False)[0]

    return Workflow.query.get(workflow_id)
Example #7
0
def handle_upload_request(apply_async=True):
    """Handle articles that are pushed from publishers.

    :param apply_async: if True, schedule the workflow asynchronously.
    :return: result of create_from_json for the parsed record.
    """
    remote_addr = request.environ['REMOTE_ADDR']

    # Lazy %-style logger arguments: the message is only formatted when the
    # INFO level is actually enabled.
    logger.info('Robotupload request received. remote_addr=%s headers=%s args=%s data=%s',
                remote_addr, request.headers, request.args, request.data[:100])
    validate_request(remote_addr)

    package_name = 'robotupload_%s_%s' % (datetime.now().isoformat(), remote_addr)

    logger.info('Package delivered. package_name=%s', package_name)

    # save delivered package
    file_data = request.data
    save_package(package_name, file_data)

    obj = parse_received_package(file_data, package_name)

    journal_title = obj['publication_info'][0]['journal_title']
    check_permission_for_journal(journal_title, remote_addr, package_name)

    return create_from_json({'records': [obj]}, apply_async=apply_async)
Example #8
0
def handle_upload_request(apply_async=True):
    """Handle articles that are pushed from publishers.

    :param apply_async: if True, schedule the workflow asynchronously.
    :return: result of create_from_json for the parsed record.
    """
    remote_addr = request.environ['REMOTE_ADDR']
    validate_request(remote_addr)

    uploaded_file = request.files['file']
    filename = secure_filename(uploaded_file.filename)
    file_data = uploaded_file.read()

    # Lazy %-style logger arguments: the message is only formatted when the
    # INFO level is actually enabled.
    logger.info('Package delivered with name %s from %s.',
                filename, remote_addr)

    # save delivered package
    delivery_time = datetime.now().isoformat()
    package_name = '_'.join((delivery_time, filename, remote_addr))
    save_package(package_name, file_data)

    obj = parse_received_package(file_data, package_name)

    journal_title = obj['publication_info'][0]['journal_title']
    check_permission_for_journal(journal_title, remote_addr, package_name)

    return create_from_json({'records': [obj]}, apply_async=apply_async)
def test_article_upload():
    """Loading the short demo data set should produce exactly one compliance check."""
    with open('scoap3/data/scoap3demodata_short.json') as source:
        # json.load parses straight from the file object — no need to
        # read it into a string first.
        records = json.load(source)
        create_from_json(records, apply_async=False)

    assert Compliance.query.count() == 1
Example #10
0
def loadrecords(source):
    """Load records migration dump.

    :param source: open file-like object containing a JSON records dump.
    """
    # Parse directly from the stream instead of buffering it into memory first.
    records = json.load(source)
    create_from_json(records)
Example #11
0
def loadrecords(source):
    """Load records migration dump.

    :param source: open file-like object containing a JSON records dump.
    """
    # Parse directly from the stream instead of buffering it into memory first.
    records = json.load(source)
    create_from_json(records)