コード例 #1
0
def test_tasks(app, db, halt_workflow, sample_records_uri):
    """Walk ``submit_results`` through its error and success paths.

    Covers, in order: a job id with no ``CrawlerJob`` behind it, an empty
    results URI, a successful submission (checking the job row and the
    workflow object with id 1) and a submission that carries errors.
    """
    job_id = uuid.uuid4().hex  # random id, no job has been created for it

    with app.app_context():
        # Nothing has been stored for ``job_id`` yet.
        with pytest.raises(CrawlerJobNotExistError):
            submit_results(
                job_id,
                results_uri=sample_records_uri,
                errors=None,
                log_file=None,
            )

        CrawlerJob.create(
            job_id=job_id,
            spider="Test",
            workflow=halt_workflow.__name__,
            logs=None,
            results=None,
        )
        db.session.commit()

        # The job exists now; an empty results URI is still refused.
        with pytest.raises(CrawlerInvalidResultsPath):
            submit_results(job_id, results_uri="", errors=None, log_file=None)

    with app.app_context():
        pending_job = CrawlerJob.get_by_job(job_id)
        assert pending_job
        assert str(pending_job.status)
        assert pending_job.status == JobStatus.PENDING

        log_path = "/foo/bar"
        submit_results(
            job_id=job_id,
            results_uri=sample_records_uri,
            errors=None,
            log_file=log_path,
        )

        submitted_job = CrawlerJob.get_by_job(job_id)
        assert submitted_job.logs == log_path
        assert submitted_job.results == sample_records_uri

        workflow = WorkflowObject.get(1)
        assert workflow
        extra_data = workflow.extra_data
        assert 'source_data' in extra_data
        source_data = extra_data['source_data']
        assert 'data' in source_data
        assert 'extra_data' in source_data
        results_path = urlparse(sample_records_uri).path
        expected_extra_data = {
            'crawler_job_id': job_id,
            'crawler_results_path': results_path,
        }
        assert expected_extra_data == source_data['extra_data']

        # A submission that reports errors raises and flags the job.
        with pytest.raises(CrawlerJobError):
            submit_results(
                job_id,
                results_uri=sample_records_uri,
                errors=["Some error"],
                log_file=None,
            )

        failed_job = CrawlerJob.get_by_job(job_id)
        assert failed_job.status == JobStatus.ERROR
コード例 #2
0
def test_tasks(app, db, halt_workflow, sample_record_filename):
    """Check ``submit_results`` error handling and its happy path.

    An empty results URI and an unknown job id must both raise; after the
    job is created, a submission updates the job row and the workflow
    object with id 1, and a submission with errors marks the job as ERROR.
    """
    job_id = uuid.uuid4().hex  # random id, no job has been created for it

    with app.app_context():
        # The empty-URI rejection is exercised twice in a row.
        for _ in range(2):
            with pytest.raises(CrawlerInvalidResultsPath):
                submit_results(
                    job_id, results_uri="", errors=None, log_file=None
                )

        # A usable URI, but still no job stored under ``job_id``.
        with pytest.raises(CrawlerJobNotExistError):
            submit_results(
                job_id,
                results_uri=sample_record_filename,
                errors=None,
                log_file=None,
            )

        CrawlerJob.create(
            job_id=job_id,
            spider="Test",
            workflow=halt_workflow.__name__,
            logs=None,
            results=None,
        )
        db.session.commit()

    with app.app_context():
        pending_job = CrawlerJob.get_by_job(job_id)

        assert pending_job
        assert str(pending_job.status)
        assert pending_job.status == JobStatus.PENDING

        log_path = "/foo/bar"
        submit_results(
            job_id=job_id,
            results_uri=sample_record_filename,
            errors=None,
            log_file=log_path,
        )

        submitted_job = CrawlerJob.get_by_job(job_id)
        assert submitted_job.logs == log_path
        assert submitted_job.results == sample_record_filename

        workflow = WorkflowObject.get(1)
        assert workflow
        assert workflow.extra_data['crawler_job_id'] == job_id
        stored_path = workflow.extra_data['crawler_results_path']
        assert stored_path == urlparse(sample_record_filename).path

        # A submission that reports errors raises and flags the job.
        with pytest.raises(CrawlerJobError):
            submit_results(
                job_id,
                results_uri=sample_record_filename,
                errors=["Some error"],
                log_file=None,
            )

        failed_job = CrawlerJob.get_by_job(job_id)
        assert failed_job.status == JobStatus.ERROR
コード例 #3
0
def test_submit_results_with_results_data(app, db, halt_workflow,
                                          sample_records_uri, sample_records):
    """Submit results as an in-memory payload next to a made-up URI.

    The URI passed to ``submit_results`` is the sample URI with
    ``'idontexist'`` appended, while the records themselves are handed
    over through ``results_data``.
    """
    job_id = uuid.uuid4().hex  # random id for the job created below

    with app.app_context():
        CrawlerJob.create(
            job_id=job_id,
            spider="Test",
            workflow=halt_workflow.__name__,
            logs=None,
            results=None,
        )
        db.session.commit()

    with app.app_context():
        pending_job = CrawlerJob.get_by_job(job_id)
        assert pending_job
        assert str(pending_job.status)
        assert pending_job.status == JobStatus.PENDING

        dummy_records_uri = sample_records_uri + 'idontexist'
        log_path = "/foo/bar"
        submit_results(
            job_id=job_id,
            results_uri=dummy_records_uri,
            results_data=sample_records,
            errors=None,
            log_file=log_path,
        )

        submitted_job = CrawlerJob.get_by_job(job_id)
        assert submitted_job.logs == log_path
        assert submitted_job.results == dummy_records_uri

        workflow = WorkflowObject.get(1)
        assert workflow
        assert workflow.extra_data['crawler_job_id'] == job_id
        stored_path = workflow.extra_data['crawler_results_path']
        assert stored_path == urlparse(dummy_records_uri).path

        # A submission that reports errors raises and flags the job.
        with pytest.raises(CrawlerJobError):
            submit_results(
                job_id,
                results_uri=dummy_records_uri,
                results_data=sample_records,
                errors=["Some error"],
                log_file=None,
            )

        failed_job = CrawlerJob.get_by_job(job_id)
        assert failed_job.status == JobStatus.ERROR
コード例 #4
0
def test_receivers(app, db, sample_record_string):
    """Check ``receive_oaiharvest_job`` with a mocked schedule endpoint.

    The call without ``spider``/``workflow`` is expected to return
    ``None``; the call with both is expected to leave behind a
    ``CrawlerJob`` retrievable by the job id the mocked endpoint returns.
    """
    job_id = uuid.uuid4().hex
    schedule_reply = {
        'jobid': job_id,
        'status': 'ok'
    }

    # Record stand-in whose ``raw`` property yields the sample string.
    record = MagicMock()
    type(record).raw = PropertyMock(return_value=sample_record_string)

    with requests_mock.Mocker() as requests_mocker:
        requests_mocker.register_uri(
            'POST',
            'http://localhost:6800/schedule.json',
            json=schedule_reply,
        )

        with app.app_context():
            outcome = receive_oaiharvest_job(
                request=None, records=[record], name="")
            assert outcome is None

            receive_oaiharvest_job(
                request=None,
                records=[record],
                name='',
                spider='Test',
                workflow='test',
            )
            scheduled_job = CrawlerJob.get_by_job(job_id)

            assert scheduled_job
コード例 #5
0
ファイル: cli_harvest.py プロジェクト: SCOAP3/scoap3-next
def schedule_and_wait_crawl(max_wait, *args, **kwargs):
    """
    Schedule a crawl and poll its job until it ends or the wait runs out.

    ``args`` and ``kwargs`` are forwarded unchanged to ``schedule_crawl``.
    Between polls the function sleeps for ``CLI_HARVEST_SLEEP_TIME``
    seconds (0.5 when the setting is absent) and refreshes the job from
    the database session. Waiting is abandoned once the number of sleeps
    times the sleep interval exceeds ``max_wait``.

    :param max_wait: upper bound for the accumulated sleep time
    :return: ``True`` only if the job status ended up as FINISHED
    """
    terminal_states = (JobStatus.ERROR, JobStatus.FINISHED)

    job_id = schedule_crawl(*args, **kwargs)
    log('Crawler job scheduled.', job_id=job_id)
    job = CrawlerJob.get_by_job(job_id)

    poll_interval = current_app.config.get('CLI_HARVEST_SLEEP_TIME', 0.5)
    polls_done = 0

    while True:
        if job.status in terminal_states:
            log('Job finished.', job_id=job_id, job_status=job.status)
            break

        # Only the time spent sleeping is counted against ``max_wait``.
        if polls_done * poll_interval > max_wait:
            log('Timeout reached, skip waiting for job.', logging.ERROR, job_id=job_id, job_status=job.status)
            break

        sleep(poll_interval)
        polls_done += 1

        db.session.refresh(job)

    return job.status == JobStatus.FINISHED
コード例 #6
0
def test_receivers(app, db, sample_record):
    """Check ``receive_oaiharvest_job`` against a stubbed schedule endpoint.

    The call without ``spider``/``workflow`` is expected to return
    ``None``; the call with both is expected to leave behind a
    ``CrawlerJob`` retrievable by the job id the stubbed endpoint returns.
    """
    job_id = uuid.uuid4().hex
    schedule_reply = json.dumps({"jobid": job_id, "status": "ok"})
    responses.add(
        responses.POST,
        "http://localhost:6800/schedule.json",
        body=schedule_reply,
        status=200,
    )

    # Record stand-in whose ``raw`` property yields the sample record.
    record = MagicMock()
    type(record).raw = PropertyMock(return_value=sample_record)

    with app.app_context():
        outcome = receive_oaiharvest_job(
            request=None, records=[record], name=""
        )
        assert outcome is None

        receive_oaiharvest_job(
            request=None,
            records=[record],
            name="",
            spider="Test",
            workflow="test",
        )
        scheduled_job = CrawlerJob.get_by_job(job_id)

        assert scheduled_job
コード例 #7
0
def schedule_and_wait_crawl(max_wait, *args, **kwargs):
    """
    Schedule a crawl, then wait for its job to reach a final status.

    ``args`` and ``kwargs`` go straight to ``schedule_crawl``. The job is
    re-read from the database session after every sleep of
    ``CLI_HARVEST_SLEEP_TIME`` seconds (0.5 when the setting is absent).
    Once the number of sleeps times the sleep interval exceeds
    ``max_wait`` an error is logged and waiting stops.

    :param max_wait: upper bound for the accumulated sleep time
    :return: ``True`` only if the job status ended up as FINISHED
    """
    done_states = (JobStatus.ERROR, JobStatus.FINISHED)

    job_id = schedule_crawl(*args, **kwargs)
    log('Crawler job scheduled.', job_id=job_id)
    job = CrawlerJob.get_by_job(job_id)

    pause = current_app.config.get('CLI_HARVEST_SLEEP_TIME', 0.5)
    rounds_slept = 0
    timed_out = False

    while job.status not in done_states and not timed_out:
        if rounds_slept * pause > max_wait:
            log(
                'Timeout reached, skip waiting for job.',
                logging.ERROR,
                job_id=job_id,
                job_status=job.status,
            )
            timed_out = True
        else:
            sleep(pause)
            rounds_slept += 1
            db.session.refresh(job)

    if job.status in done_states:
        log('Job finished.', job_id=job_id, job_status=job.status)

    return job.status == JobStatus.FINISHED
コード例 #8
0
def test_create_workflow_for_faulty_data(app, db, halt_workflow):
    """Submit an error-describing result and expect an ERROR workflow.

    The payload holds a single entry with ``error``, ``traceback`` and
    ``xml_record`` keys; the workflow object linked to the job must end
    up with status ``ObjectStatus.ERROR``.
    """
    job_id = uuid.uuid4().hex  # random id for the job created below

    with app.app_context():
        CrawlerJob.create(
            job_id=job_id,
            spider="desy",
            workflow=halt_workflow.__name__,
            logs=None,
            results=None,
        )
        db.session.commit()

    with app.app_context():
        pending_job = CrawlerJob.get_by_job(job_id)
        assert pending_job
        assert str(pending_job.status)
        assert pending_job.status == JobStatus.PENDING

        faulty_result = {
            'error': 'ValueError',
            'traceback': 'There was a ValueError',
            'xml_record': 'Just an XML string',
        }
        submit_results(
            job_id=job_id,
            results_uri='idontexist',
            results_data=[faulty_result],
            errors=None,
            log_file="/foo/bar",
        )

        # Exactly one workflow object is expected for this job.
        link = CrawlerWorkflowObject.query.filter_by(job_id=job_id).one()
        workflow = WorkflowObject.get(link.object_id)
        assert workflow.status == ObjectStatus.ERROR
コード例 #9
0
def test_create_workflow_for_faulty_data(app, db, halt_workflow):
    """Submit a crawl result that carries errors and inspect the workflow.

    The workflow object linked to the job must have status
    ``ObjectStatus.ERROR``, hold the submitted ``record`` as its data and
    expose the remaining keys of the result under
    ``extra_data['crawl_errors']``.
    """
    job_id = uuid.uuid4().hex  # random id for the job created below

    with app.app_context():
        CrawlerJob.create(
            job_id=job_id,
            spider="desy",
            workflow=halt_workflow.__name__,
            logs=None,
            results=None,
        )
        db.session.commit()

    with app.app_context():
        pending_job = CrawlerJob.get_by_job(job_id)
        assert pending_job
        assert str(pending_job.status)
        assert pending_job.status == JobStatus.PENDING

        test_data = {
            'errors': [
                {
                    'exception': 'ValueError',
                    'traceback': 'ValueError on the line 23.',
                },
            ],
            'source_data': 'Just an XML string',
            'record': {},
            'file_name': 'broken.xml',
        }
        submit_results(
            job_id=job_id,
            results_uri='idontexist',
            results_data=[test_data],
            errors=None,
            log_file="/foo/bar",
        )

        # Exactly one workflow object is expected for this job.
        link = CrawlerWorkflowObject.query.filter_by(job_id=job_id).one()
        workflow = WorkflowObject.get(link.object_id)

        # Spelled out separately from ``test_data`` so the expectation
        # shares no objects with the payload that was submitted.
        expected_crawl_error = {
            'errors': [
                {
                    'exception': 'ValueError',
                    'traceback': 'ValueError on the line 23.',
                },
            ],
            'source_data': 'Just an XML string',
            'file_name': 'broken.xml',
        }

        assert workflow.status == ObjectStatus.ERROR
        assert workflow.data == test_data['record']
        assert workflow.extra_data['crawl_errors'] == expected_crawl_error
コード例 #10
0
def test_create_error_workflow_for_wrong_crawl_result(app, db, halt_workflow):
    """Submit a crawl result lacking ``errors`` and ``file_name``.

    The workflow object linked to the job must have status
    ``ObjectStatus.ERROR``, empty data, and a ``crawl_errors`` entry that
    reports a ``KeyError`` and carries the whole submitted result as its
    ``source_data``.
    """
    job_id = uuid.uuid4().hex  # random id for the job created below

    with app.app_context():
        CrawlerJob.create(
            job_id=job_id,
            spider="desy",
            workflow=halt_workflow.__name__,
            logs=None,
            results=None,
        )
        db.session.commit()

    with app.app_context():
        pending_job = CrawlerJob.get_by_job(job_id)
        assert pending_job
        assert str(pending_job.status)
        assert pending_job.status == JobStatus.PENDING

        # Deliberately incomplete: no 'errors' and no 'file_name' key.
        malformed_result = {
            'source_data': 'Just an XML string',
            'record': {},
        }
        submit_results(
            job_id=job_id,
            results_uri='idontexist',
            results_data=[malformed_result],
            errors=None,
            log_file="/foo/bar",
        )

        # Exactly one workflow object is expected for this job.
        link = CrawlerWorkflowObject.query.filter_by(job_id=job_id).one()
        workflow = WorkflowObject.get(link.object_id)

        expected_crawl_errors = {
            'errors': [
                {
                    'exception': 'KeyError',
                    'traceback': 'Wrong crawl result format. '
                                 'Missing the key `errors`',
                },
            ],
            'file_name': None,
            'source_data': {
                'record': {},
                'source_data': 'Just an XML string',
            },
        }

        assert workflow.status == ObjectStatus.ERROR
        assert workflow.data == {}
        assert workflow.extra_data['crawl_errors'] == expected_crawl_errors