Beispiel #1
0
def test_create_workflow_for_faulty_data(app, db, halt_workflow):
    """Check that a faulty record passed as payload gives an ERROR workflow.

    A crawler job is created and committed, verified to be pending, and then
    ``submit_results`` is called with ``results_data`` holding one record
    that describes a crawl failure. The workflow object registered for the
    job must end up with ``ObjectStatus.ERROR``.
    """
    job_id = uuid.uuid4().hex  # random identifier for this test run

    # Create and persist the job inside its own application context.
    with app.app_context():
        CrawlerJob.create(
            job_id=job_id,
            spider="desy",
            workflow=halt_workflow.__name__,
            logs=None,
            results=None,
        )
        db.session.commit()

    with app.app_context():
        crawler_job = CrawlerJob.get_by_job(job_id)
        assert crawler_job
        assert str(crawler_job.status)
        assert crawler_job.status == JobStatus.PENDING

        faulty_record = {
            'error': 'ValueError',
            'traceback': 'There was a ValueError',
            'xml_record': 'Just an XML string',
        }
        submit_results(
            job_id=job_id,
            results_uri='idontexist',
            results_data=[faulty_record],
            errors=None,
            log_file="/foo/bar",
        )

        # Exactly one CrawlerWorkflowObject row is expected for this job;
        # it carries the id of the workflow object to inspect.
        job_workflow = CrawlerWorkflowObject.query.filter_by(
            job_id=job_id
        ).one()
        workflow = WorkflowObject.get(job_workflow.object_id)
        assert workflow.status == ObjectStatus.ERROR
Beispiel #2
0
def test_tasks(app, db, halt_workflow, sample_records_uri):
    """Exercise ``submit_results`` through its failure and success paths.

    In order: an unknown job id raises ``CrawlerJobNotExistError``; once the
    job exists, an empty results URI raises ``CrawlerInvalidResultsPath``; a
    valid submission stores the log file and results URI on the job and
    produces a workflow whose ``source_data`` carries the crawler metadata;
    finally a submission with errors raises ``CrawlerJobError`` and the job
    ends up in ``JobStatus.ERROR``.
    """
    job_id = uuid.uuid4().hex  # random identifier for this test run

    with app.app_context():
        # No job has been created under this id yet.
        with pytest.raises(CrawlerJobNotExistError):
            submit_results(
                job_id,
                results_uri=sample_records_uri,
                errors=None,
                log_file=None,
            )

        CrawlerJob.create(
            job_id=job_id,
            spider="Test",
            workflow=halt_workflow.__name__,
            logs=None,
            results=None,
        )
        db.session.commit()

        # The job exists now, but an empty results URI must be rejected.
        with pytest.raises(CrawlerInvalidResultsPath):
            submit_results(job_id, results_uri="", errors=None, log_file=None)

    with app.app_context():
        crawler_job = CrawlerJob.get_by_job(job_id)
        assert crawler_job
        assert str(crawler_job.status)
        assert crawler_job.status == JobStatus.PENDING

        submit_results(
            job_id=job_id,
            results_uri=sample_records_uri,
            errors=None,
            log_file="/foo/bar",
        )

        # Re-read the job to observe what the submission stored on it.
        crawler_job = CrawlerJob.get_by_job(job_id)
        assert crawler_job.logs == "/foo/bar"
        assert crawler_job.results == sample_records_uri

        # NOTE(review): relies on the workflow created above having id 1 --
        # presumably the test database starts empty; confirm.
        workflow = WorkflowObject.get(1)
        assert workflow
        extra_data = workflow.extra_data
        assert 'source_data' in extra_data
        source_data = extra_data['source_data']
        assert 'data' in source_data
        assert 'extra_data' in source_data
        expected_extra_data = {
            'crawler_job_id': job_id,
            'crawler_results_path': urlparse(sample_records_uri).path,
        }
        assert expected_extra_data == source_data['extra_data']

        # Reporting errors for the same job must raise and flag the job.
        with pytest.raises(CrawlerJobError):
            submit_results(
                job_id,
                results_uri=sample_records_uri,
                errors=["Some error"],
                log_file=None,
            )

        crawler_job = CrawlerJob.get_by_job(job_id)
        assert crawler_job.status == JobStatus.ERROR
def test_tasks(app, db, halt_workflow, sample_record_filename):
    """Exercise ``submit_results`` through its failure and success paths.

    In order: an empty results URI raises ``CrawlerInvalidResultsPath``; a
    valid URI for a job that was never created raises
    ``CrawlerJobNotExistError``; after the job is created, a valid submission
    stores the log file and results URI on the job and produces a workflow
    whose ``extra_data`` carries the crawler job id and results path; finally
    a submission with errors raises ``CrawlerJobError`` and the job ends up
    in ``JobStatus.ERROR``.
    """
    job_id = uuid.uuid4().hex  # random identifier for this test run

    with app.app_context():
        # An empty results URI is rejected even though no job exists yet.
        with pytest.raises(CrawlerInvalidResultsPath):
            submit_results(job_id, results_uri="", errors=None, log_file=None)

        # A usable URI, but nothing has been stored under this job id.
        with pytest.raises(CrawlerJobNotExistError):
            submit_results(
                job_id,
                results_uri=sample_record_filename,
                errors=None,
                log_file=None,
            )

        CrawlerJob.create(
            job_id=job_id,
            spider="Test",
            workflow=halt_workflow.__name__,
            logs=None,
            results=None,
        )
        db.session.commit()

    with app.app_context():
        job = CrawlerJob.get_by_job(job_id)

        assert job
        assert str(job.status)
        assert job.status == JobStatus.PENDING

        submit_results(
            job_id=job_id,
            results_uri=sample_record_filename,
            errors=None,
            log_file="/foo/bar",
        )

        # Re-read the job to observe what the submission stored on it.
        job = CrawlerJob.get_by_job(job_id)
        assert job.logs == "/foo/bar"
        assert job.results == sample_record_filename

        # NOTE(review): relies on the workflow created above having id 1 --
        # presumably the test database starts empty; confirm.
        workflow = WorkflowObject.get(1)
        assert workflow
        assert workflow.extra_data['crawler_job_id'] == job_id
        crawler_results_path = workflow.extra_data['crawler_results_path']
        assert crawler_results_path == urlparse(sample_record_filename).path

        # Reporting errors for the same job must raise and flag the job.
        with pytest.raises(CrawlerJobError):
            submit_results(
                job_id,
                results_uri=sample_record_filename,
                errors=["Some error"],
                log_file=None,
            )

        job = CrawlerJob.get_by_job(job_id)
        assert job.status == JobStatus.ERROR
Beispiel #4
0
def test_create_workflow_for_faulty_data(app, db, halt_workflow):
    """Check the workflow built from a crawl result that reports errors.

    A crawler job is created and committed, verified to be pending, and then
    ``submit_results`` receives a single crawl result containing ``errors``,
    ``source_data``, ``record`` and ``file_name``. The workflow registered
    for the job must be in ``ObjectStatus.ERROR``, hold the submitted
    ``record`` as its data, and expose the remaining keys under
    ``extra_data['crawl_errors']``.
    """
    job_id = uuid.uuid4().hex  # random identifier for this test run

    # Create and persist the job inside its own application context.
    with app.app_context():
        CrawlerJob.create(
            job_id=job_id,
            spider="desy",
            workflow=halt_workflow.__name__,
            logs=None,
            results=None,
        )
        db.session.commit()

    with app.app_context():
        crawler_job = CrawlerJob.get_by_job(job_id)
        assert crawler_job
        assert str(crawler_job.status)
        assert crawler_job.status == JobStatus.PENDING

        test_data = {
            'errors': [
                {
                    'exception': 'ValueError',
                    'traceback': 'ValueError on the line 23.',
                },
            ],
            'source_data': 'Just an XML string',
            'record': {},
            'file_name': 'broken.xml',
        }
        # Same content as the payload minus 'record'; written out as an
        # independent literal so it shares no objects with what is submitted.
        expected_crawl_error = {
            'errors': [
                {
                    'exception': 'ValueError',
                    'traceback': 'ValueError on the line 23.',
                },
            ],
            'source_data': 'Just an XML string',
            'file_name': 'broken.xml',
        }

        submit_results(
            job_id=job_id,
            results_uri='idontexist',
            results_data=[test_data],
            errors=None,
            log_file="/foo/bar",
        )

        # Exactly one CrawlerWorkflowObject row is expected for this job;
        # it carries the id of the workflow object to inspect.
        job_workflow = CrawlerWorkflowObject.query.filter_by(
            job_id=job_id
        ).one()
        workflow = WorkflowObject.get(job_workflow.object_id)

        assert workflow.status == ObjectStatus.ERROR
        assert workflow.data == test_data['record']
        assert workflow.extra_data['crawl_errors'] == expected_crawl_error
Beispiel #5
0
def test_create_error_workflow_for_wrong_crawl_result(app, db, halt_workflow):
    """Check the error workflow built from a malformed crawl result.

    The submitted crawl result lacks the ``errors`` and ``file_name`` keys.
    The workflow registered for the job must be in ``ObjectStatus.ERROR``
    with empty data, and ``extra_data['crawl_errors']`` must describe a
    ``KeyError`` for the missing ``errors`` key, carry ``None`` as file name
    and keep the whole submitted result as ``source_data``.
    """
    job_id = uuid.uuid4().hex  # random identifier for this test run

    # Create and persist the job inside its own application context.
    with app.app_context():
        CrawlerJob.create(
            job_id=job_id,
            spider="desy",
            workflow=halt_workflow.__name__,
            logs=None,
            results=None,
        )
        db.session.commit()

    with app.app_context():
        crawler_job = CrawlerJob.get_by_job(job_id)
        assert crawler_job
        assert str(crawler_job.status)
        assert crawler_job.status == JobStatus.PENDING

        malformed_result = {
            'source_data': 'Just an XML string',
            'record': {},
            # missing 'errors' and 'file_name'
        }
        submit_results(
            job_id=job_id,
            results_uri='idontexist',
            results_data=[malformed_result],
            errors=None,
            log_file="/foo/bar",
        )

        # Exactly one CrawlerWorkflowObject row is expected for this job;
        # it carries the id of the workflow object to inspect.
        job_workflow = CrawlerWorkflowObject.query.filter_by(
            job_id=job_id
        ).one()
        workflow = WorkflowObject.get(job_workflow.object_id)

        expected = {
            'errors': [
                {
                    'exception': 'KeyError',
                    'traceback': (
                        'Wrong crawl result format. '
                        'Missing the key `errors`'
                    ),
                },
            ],
            'file_name': None,
            'source_data': {
                'record': {},
                'source_data': 'Just an XML string',
            },
        }

        assert workflow.status == ObjectStatus.ERROR
        assert workflow.data == {}
        assert workflow.extra_data['crawl_errors'] == expected
Beispiel #6
0
def test_submit_results_with_results_data(app, db, halt_workflow,
                                          sample_records_uri, sample_records):
    """Check ``submit_results`` when the records are passed as payload.

    The results URI handed over is the sample URI with ``'idontexist'``
    appended, while the records themselves are supplied via
    ``results_data``. The job must record the log file and that URI, the
    workflow must carry the job id and the URI's path in its ``extra_data``,
    and a later submission with errors must raise ``CrawlerJobError`` and
    leave the job in ``JobStatus.ERROR``.
    """
    job_id = uuid.uuid4().hex  # random identifier for this test run

    # Create and persist the job inside its own application context.
    with app.app_context():
        CrawlerJob.create(
            job_id=job_id,
            spider="Test",
            workflow=halt_workflow.__name__,
            logs=None,
            results=None,
        )
        db.session.commit()

    with app.app_context():
        crawler_job = CrawlerJob.get_by_job(job_id)
        assert crawler_job
        assert str(crawler_job.status)
        assert crawler_job.status == JobStatus.PENDING

        dummy_records_uri = sample_records_uri + 'idontexist'
        submit_results(
            job_id=job_id,
            results_uri=dummy_records_uri,
            results_data=sample_records,
            errors=None,
            log_file="/foo/bar",
        )

        # Re-read the job to observe what the submission stored on it.
        crawler_job = CrawlerJob.get_by_job(job_id)
        assert crawler_job.logs == "/foo/bar"
        assert crawler_job.results == dummy_records_uri

        # NOTE(review): relies on the workflow created above having id 1 --
        # presumably the test database starts empty; confirm.
        workflow = WorkflowObject.get(1)
        assert workflow
        assert workflow.extra_data['crawler_job_id'] == job_id
        stored_results_path = workflow.extra_data['crawler_results_path']
        assert stored_results_path == urlparse(dummy_records_uri).path

        # Reporting errors for the same job must raise and flag the job.
        with pytest.raises(CrawlerJobError):
            submit_results(
                job_id,
                results_uri=dummy_records_uri,
                results_data=sample_records,
                errors=["Some error"],
                log_file=None,
            )

        crawler_job = CrawlerJob.get_by_job(job_id)
        assert crawler_job.status == JobStatus.ERROR