Exemple #1
0
    def from_json(cls, json, workflow_id=None):
        """
        Constructor for a holdingpen entry, it will be able to be mapped to and
        from json, and used to fully edit entries. Usually you pass to it the
        full raw json from the details of a holdingpen entry.

        The resource class is picked from ``json['_workflow']['data_type']``:
        ``'hep'`` builds a ``HoldingpenLiteratureResource`` and ``'authors'``
        builds a ``HoldingpenAuthorResource``.

        Args:
            json(dict): dictionary of a single entry as returned by the api.
            workflow_id: id to set on the resource, defaults to ``json['id']``
                when not given.

        Returns:
            The newly built resource, with the passed dictionary stored on it
            as ``_raw_json``.

        Raises:
            ValueError: if the data type is neither ``'hep'`` nor
                ``'authors'``.
            KeyError: if one of the mandatory keys (``_workflow``,
                ``metadata``, ...) is missing from ``json``.
        """
        if workflow_id is None:
            workflow_id = json['id']

        # ``or {}`` also covers a key that is present but set to null.
        extra_data = json.get('_extra_data') or {}
        workflow = json['_workflow']
        data_type = workflow['data_type']

        if data_type == 'hep':
            metadata = json['metadata']
            # The identifier lists may be missing *or* present but empty; use
            # a placeholder in both cases so that ``[0]`` cannot raise an
            # IndexError and the value simply ends up being None.
            first_eprint = (metadata.get('arxiv_eprints') or [{}])[0]
            first_doi = (metadata.get('dois') or [{}])[0]
            matches = extra_data.get('matches') or {}

            hp_entry = HoldingpenLiteratureResource(
                workflow_id=workflow_id,
                approved=extra_data.get('approved'),
                auto_approved=extra_data.get('auto-approved'),
                is_update=extra_data.get('is-update'),
                core=extra_data.get('core'),
                status=workflow['status'],
                titles=[
                    LiteratureResourceTitle.from_json(title)
                    for title in metadata['titles']
                ],
                control_number=metadata.get('control_number'),
                arxiv_eprint=first_eprint.get('value'),
                doi=first_doi.get('value'),
                approved_match=matches.get('approved'),
            )
        elif data_type == 'authors':
            metadata = json['metadata']
            hp_entry = HoldingpenAuthorResource(
                workflow_id=workflow_id,
                approved=extra_data.get('approved'),
                is_update=extra_data.get('is-update'),
                core=extra_data.get('core'),
                status=workflow['status'],
                control_number=metadata.get('control_number'),
                display_name=metadata['name']['preferred_name'],
            )
        else:
            raise ValueError(
                'Unsupported holdingpen resource type "{}"'.format(data_type))

        # Keep the original payload around so it can be edited and sent back.
        hp_entry._raw_json = json
        return hp_entry
    def from_json(cls, json, workflow_id=None):
        """
        Constructor for a holdingpen entry, it will be able to be mapped to and
        from json, and used to fully edit entries. Usually you pass to it the
        full raw json from the details of a holdingpen entry.

        The resource class is picked from ``json['_workflow']['data_type']``:
        ``'hep'`` builds a ``HoldingpenLiteratureResource`` and ``'authors'``
        builds a ``HoldingpenAuthorResource``.

        Args:
            json(dict): dictionary of a single entry as returned by the api.
            workflow_id: id to set on the resource, defaults to ``json['id']``
                when not given.

        Returns:
            The newly built resource, with the passed dictionary stored on it
            as ``_raw_json``.

        Raises:
            ValueError: if the data type is neither ``'hep'`` nor
                ``'authors'``.
            KeyError: if one of the mandatory keys (``_workflow``,
                ``metadata``, ...) is missing from ``json``.
        """
        if workflow_id is None:
            workflow_id = json['id']

        # ``or {}`` also covers a key that is present but set to null.
        extra_data = json.get('_extra_data') or {}
        workflow = json['_workflow']
        data_type = workflow['data_type']

        if data_type == 'hep':
            metadata = json['metadata']
            # The identifier lists may be missing *or* present but empty; use
            # a placeholder in both cases so that ``[0]`` cannot raise an
            # IndexError and the value simply ends up being None.
            first_eprint = (metadata.get('arxiv_eprints') or [{}])[0]
            first_doi = (metadata.get('dois') or [{}])[0]
            matches = extra_data.get('matches') or {}

            hp_entry = HoldingpenLiteratureResource(
                workflow_id=workflow_id,
                approved=extra_data.get('approved'),
                auto_approved=extra_data.get('auto-approved'),
                is_update=extra_data.get('is-update'),
                core=extra_data.get('core'),
                status=workflow['status'],
                titles=[LiteratureResourceTitle.from_json(title) for title in metadata['titles']],
                control_number=metadata.get('control_number'),
                arxiv_eprint=first_eprint.get('value'),
                doi=first_doi.get('value'),
                approved_match=matches.get('approved'),
            )
        elif data_type == 'authors':
            metadata = json['metadata']
            hp_entry = HoldingpenAuthorResource(
                workflow_id=workflow_id,
                approved=extra_data.get('approved'),
                is_update=extra_data.get('is-update'),
                core=extra_data.get('core'),
                status=workflow['status'],
                control_number=metadata.get('control_number'),
                display_name=metadata['name']['preferred_name'],
            )
        else:
            raise ValueError('Unsupported holdingpen resource type "{}"'.format(data_type))

        # Keep the original payload around so it can be edited and sent back.
        hp_entry._raw_json = json
        return hp_entry
Exemple #3
0
def test_harvest_non_core_article_goes_in(inspire_client, mitm_client):
    """Crawl the arXiv ``physics`` set and follow one article to completion.

    After scheduling the crawl, the test waits for ``_workflows_in_status``
    (status ``COMPLETED``, ``num_entries=1``), loads the detail of the first
    returned workflow and checks its fields, the matching literature record
    and that the ``robotupload`` interaction was used exactly once.
    """
    inspire_client.e2e.schedule_crawl(
        url='http://export.arxiv.org/oai2',
        spider='arXiv',
        workflow='article',
        from_date='2018-03-25',
        sets='physics',
    )

    def _completed_workflows():
        return _workflows_in_status(
            holdingpen_client=inspire_client.holdingpen,
            status='COMPLETED',
            num_entries=1,
        )

    finished = wait_for(_completed_workflows)[0]
    entry = inspire_client.holdingpen.get_detail_entry(finished.workflow_id)

    # The workflow went through without curator intervention.
    assert entry.approved is True
    assert entry.arxiv_eprint == '1404.0579'
    assert entry.control_number
    assert entry.core is None
    assert entry.doi == '10.1016/j.nima.2014.04.029'
    assert entry.status == 'COMPLETED'
    olympus_title = LiteratureResourceTitle(
        source='arXiv',
        title='The OLYMPUS Internal Hydrogen Target',
    )
    assert entry.titles == [olympus_title]

    # The literature record exists and agrees with the workflow.
    record = inspire_client.literature.get_record(entry.control_number)
    assert entry.titles == record.titles

    # The external (mocked) service was really contacted.
    mitm_client.assert_interaction_used(
        times=1,
        service_name='LegacyService',
        interaction_name='robotupload',
    )
Exemple #4
0
def test_harvest_nucl_th_and_jlab_curation(inspire_client, mitm_client):
    """Harvest a single nucl-th arXiv record and run it through curation.

    Steps exercised:

    1. Crawl one arXiv record and wait for its workflow to be ``COMPLETED``.
    2. Check the workflow fields, the literature record, and that the
       ``robotupload`` and ``ticket_new`` interactions were each used once.
    3. Open the curation link found in the recorded ``ticket_new`` scenario
       and wait until the holdingpen reports two entries, one of which must
       be ``WAITING``.
    4. Change the first title of that waiting entry, resume it, wait for a
       ``COMPLETED`` workflow and check that the literature record carries
       the new title.
    """
    inspire_client.e2e.schedule_crawl(
        spider='arXiv_single',
        workflow='article',
        url='http://export.arxiv.org/oai2',
        identifier='oai:arXiv.org:1806.05669',  # nucl-th record
    )

    completed_entry = wait_for(lambda: _workflows_in_status(
        holdingpen_client=inspire_client.holdingpen,
        status='COMPLETED',
        num_entries=1,
    ))[0]
    entry = inspire_client.holdingpen.get_detail_entry(
        completed_entry.workflow_id)

    assert entry.arxiv_eprint == '1806.05669'
    # Compare by value: ``is`` tests object identity, which for integers only
    # happens to work because of an interpreter implementation detail.
    assert entry.control_number == 42
    expected_titles = [
        LiteratureResourceTitle(
            title=
            'Probing the in-Medium QCD Force by Open Heavy-Flavor Observables',
            source='arXiv',
        )
    ]
    assert entry.titles == expected_titles

    # check literature record is available and consistent
    record = inspire_client.literature.get_record(entry.control_number)
    assert record.titles == entry.titles

    # check that the external services were actually called
    mitm_client.assert_interaction_used(
        service_name='LegacyService',
        interaction_name='robotupload',
        times=1,
    )
    mitm_client.assert_interaction_used(
        service_name='RTService',
        interaction_name='ticket_new',
        times=1,
    )

    def _get_ticket_content():
        """Return the edit-article link found in the ticket_new scenario."""
        curr_path = os.path.dirname(__file__)
        ticket_file = os.path.join(curr_path, 'scenarios',
                                   'harvest_nucl_th_and_jlab_curation',
                                   'RTService', 'ticket_new.yaml')
        # Close the scenario file deterministically instead of leaking the
        # handle until garbage collection.
        with open(ticket_file) as ticket_fd:
            # NOTE(review): ``yaml.load`` can build arbitrary objects; it is
            # kept as-is because the file sits next to this test module, but
            # ``yaml.safe_load`` is worth considering - confirm the scenario
            # uses no custom tags first.
            content = yaml.load(ticket_fd)
        ticket_content = urllib2.unquote(
            content['request']['body']).decode('utf8')
        # Raw string: '/' needs no escaping and '\/' is an invalid escape.
        return re.search(r'/workflows/edit_article/[0-9]+',
                         ticket_content).group(0)

    curation_link = _get_ticket_content()
    assert inspire_client._client.get(curation_link).status_code == 200

    new_entries = wait_for(
        lambda: _number_of_entries(inspire_client.holdingpen, 2))
    assert len(new_entries) == 2
    # A list comprehension is indexable whether or not ``filter`` returns a
    # list on the running interpreter.
    edit_article_wf = [
        candidate for candidate in new_entries
        if candidate.status == 'WAITING'
    ][0]

    entry = inspire_client.holdingpen.get_detail_entry(
        edit_article_wf.workflow_id)

    def apply_changes_to_wf():
        """Return a resource built from the entry json with a new title."""
        new_title = 'Title changed by JLab curator'
        curated_content = entry._raw_json
        curated_content['metadata']['titles'][0]['title'] = new_title
        return HoldingpenResource.from_json(curated_content)

    entry = apply_changes_to_wf()
    inspire_client.holdingpen.resume(entry)

    entry = wait_for(lambda: _workflows_in_status(
        holdingpen_client=inspire_client.holdingpen,
        status='COMPLETED',
        num_entries=1,
    ))[0]
    entry = inspire_client.holdingpen.get_detail_entry(entry.workflow_id)

    # NOTE(review): fixed delay, presumably to let the record update land
    # before it is read back - polling would be less fragile; confirm.
    time.sleep(5)
    # check literature record is available and consistent
    record = inspire_client.literature.get_record(entry.control_number)
    expected_titles = [
        LiteratureResourceTitle(
            title='Title changed by JLab curator',
            source='arXiv',
        )
    ]
    assert record.titles == expected_titles
Exemple #5
0
def test_harvest_core_article_manual_accept_goes_in(inspire_client,
                                                    mitm_client):
    """Crawl the arXiv ``q-bio`` set and accept the halted article as core.

    The test first expects a ``HALTED`` workflow with no approval and no
    control number, calls ``accept_core`` on it, then expects a ``COMPLETED``
    workflow with control number 42, a consistent literature record, and one
    use each of the ``robotupload`` and ``ticket_new`` interactions.
    """
    def _first_workflow_with_status(wanted_status):
        # Wait on ``_workflows_in_status`` and hand back the first entry.
        return wait_for(lambda: _workflows_in_status(
            holdingpen_client=inspire_client.holdingpen,
            status=wanted_status,
            num_entries=1,
        ))[0]

    inspire_client.e2e.schedule_crawl(
        url='http://export.arxiv.org/oai2',
        spider='arXiv',
        workflow='article',
        from_date='2018-03-25',
        sets='q-bio',
    )

    halted = _first_workflow_with_status('HALTED')
    entry = inspire_client.holdingpen.get_detail_entry(halted.workflow_id)

    # Before the manual decision nothing is approved nor assigned.
    assert entry.approved is None
    assert entry.arxiv_eprint == '1806.05312'
    assert entry.control_number is None
    assert entry.core is None
    assert entry.doi == '10.1063/PT.3.3947'
    assert entry.status == 'HALTED'
    star_title = LiteratureResourceTitle(
        title='The turbulent formation of stars',
        source='arXiv',
    )
    expected_titles = [star_title]
    assert entry.titles == expected_titles

    inspire_client.holdingpen.accept_core(holdingpen_id=entry.workflow_id)

    # After acceptance the workflow must reach the completed state.
    completed = _first_workflow_with_status('COMPLETED')
    entry = inspire_client.holdingpen.get_detail_entry(completed.workflow_id)

    assert entry.arxiv_eprint == '1806.05312'
    assert entry.control_number == 42
    assert entry.doi == '10.1063/PT.3.3947'
    assert entry.titles == expected_titles

    # The literature record exists and agrees with the workflow.
    record = inspire_client.literature.get_record(entry.control_number)
    assert record.titles == entry.titles

    # Both external (mocked) services were really contacted, once each.
    mitm_client.assert_interaction_used(
        times=1,
        service_name='LegacyService',
        interaction_name='robotupload',
    )
    mitm_client.assert_interaction_used(
        times=1,
        service_name='RTService',
        interaction_name='ticket_new',
    )
Exemple #6
0
def test_harvest_core_article_goes_in(inspire_client, mitm_client):
    """Crawl a core article from arXiv, then crawl an update of it.

    First crawl (``from_date='2018-03-25'``): one ``COMPLETED`` workflow is
    expected, approved, core and not flagged as an update. Second crawl
    (``from_date='2018-03-26'``): two ``COMPLETED`` workflows are expected;
    the one with the highest ``workflow_id`` is taken as the update and must
    be auto approved, flagged as an update, matched to the first record and
    carry both the updated and the original title. After each crawl the
    ``robotupload`` and ``ticket_new`` interactions are asserted with
    ``times=1``.
    """
    def _crawl_physics_from(start_date):
        inspire_client.e2e.schedule_crawl(
            url='http://export.arxiv.org/oai2',
            spider='arXiv',
            workflow='article',
            from_date=start_date,
            sets='physics',
        )

    def _wait_for_completed(how_many):
        return wait_for(lambda: _workflows_in_status(
            holdingpen_client=inspire_client.holdingpen,
            status='COMPLETED',
            num_entries=how_many,
        ))

    def _assert_services_used_once():
        mitm_client.assert_interaction_used(
            times=1,
            service_name='LegacyService',
            interaction_name='robotupload',
        )
        mitm_client.assert_interaction_used(
            times=1,
            service_name='RTService',
            interaction_name='ticket_new',
        )

    # ---- first harvest -----------------------------------------------------
    _crawl_physics_from('2018-03-25')

    first_done = _wait_for_completed(1)[0]
    entry = inspire_client.holdingpen.get_detail_entry(first_done.workflow_id)

    original_title = LiteratureResourceTitle(
        title=
        'BRST-BFV Lagrangian Formulations for Higher Spin Fields subject to two-column Young Tableaux',
        source='arXiv',
    )

    assert entry.approved is True
    assert entry.arxiv_eprint == '1412.0200'
    assert entry.control_number
    assert entry.core
    assert entry.status == 'COMPLETED'
    assert entry.titles == [original_title]
    assert not entry.is_update

    # The literature record exists and agrees with the workflow.
    record = inspire_client.literature.get_record(entry.control_number)
    assert record.titles == entry.titles

    _assert_services_used_once()

    # ---- second harvest: the update -----------------------------------------
    _crawl_physics_from('2018-03-26')

    all_done = _wait_for_completed(2)
    # The workflow with the highest id is taken to be the update.
    newest = max(all_done, key=lambda wf: wf.workflow_id)
    update_entry = inspire_client.holdingpen.get_detail_entry(
        newest.workflow_id)

    # Because of the merge the title list grows instead of being replaced.
    updated_title = LiteratureResourceTitle(
        title=
        'BRST-BFV Lagrangian Formulations for Higher Spin Fields subject to two-column Young Tableaux updated',
        source='arXiv',
    )
    merged_titles = [
        updated_title,
        LiteratureResourceTitle(
            title=
            'BRST-BFV Lagrangian Formulations for Higher Spin Fields subject to two-column Young Tableaux',
            source='arXiv',
        ),
    ]
    assert update_entry.auto_approved is True
    assert update_entry.arxiv_eprint == '1412.0200'
    assert update_entry.core
    assert update_entry.status == 'COMPLETED'
    assert update_entry.titles == merged_titles
    assert update_entry.is_update
    assert update_entry.approved_match == entry.control_number

    # The literature record exists and agrees with the update workflow.
    record = inspire_client.literature.get_record(update_entry.control_number)
    assert record.titles == update_entry.titles

    # check that the external services were actually called, the updates flag
    # is disabled
    _assert_services_used_once()