def test_tasks(app, db, halt_workflow, sample_records_uri):
    """Test tasks."""
    job_id = uuid.uuid4().hex  # init random value

    with app.app_context():
        with pytest.raises(CrawlerJobNotExistError):
            submit_results(
                job_id,
                results_uri=sample_records_uri,
                errors=None,
                log_file=None
            )

        CrawlerJob.create(
            job_id=job_id,
            spider="Test",
            workflow=halt_workflow.__name__,
            logs=None,
            results=None,
        )
        db.session.commit()

        with pytest.raises(CrawlerInvalidResultsPath):
            submit_results(job_id, results_uri="", errors=None, log_file=None)

    with app.app_context():
        job = CrawlerJob.get_by_job(job_id)
        assert job
        assert str(job.status)
        assert job.status == JobStatus.PENDING

        submit_results(
            job_id=job_id,
            results_uri=sample_records_uri,
            errors=None,
            log_file="/foo/bar"
        )

        job = CrawlerJob.get_by_job(job_id)
        assert job.logs == "/foo/bar"
        assert job.results == sample_records_uri

        workflow = WorkflowObject.get(1)
        assert workflow
        extra_data = workflow.extra_data
        assert 'source_data' in extra_data
        assert 'data' in extra_data['source_data']
        assert 'extra_data' in extra_data['source_data']

        expected_extra_data = {
            'crawler_job_id': job_id,
            'crawler_results_path': urlparse(sample_records_uri).path,
        }
        assert expected_extra_data == extra_data['source_data']['extra_data']

        with pytest.raises(CrawlerJobError):
            submit_results(
                job_id,
                results_uri=sample_records_uri,
                errors=["Some error"],
                log_file=None
            )

        job = CrawlerJob.get_by_job(job_id)
        assert job.status == JobStatus.ERROR
def test_tasks(app, db, halt_workflow, sample_record_filename):
    """Test tasks."""
    job_id = uuid.uuid4().hex  # init random value

    with app.app_context():
        with pytest.raises(CrawlerInvalidResultsPath):
            submit_results(job_id, results_uri="", errors=None, log_file=None)

        with pytest.raises(CrawlerJobNotExistError):
            submit_results(
                job_id,
                results_uri=sample_record_filename,
                errors=None,
                log_file=None
            )

        CrawlerJob.create(
            job_id=job_id,
            spider="Test",
            workflow=halt_workflow.__name__,
            logs=None,
            results=None,
        )
        db.session.commit()

    with app.app_context():
        job = CrawlerJob.get_by_job(job_id)
        assert job
        assert str(job.status)
        assert job.status == JobStatus.PENDING

        submit_results(
            job_id=job_id,
            results_uri=sample_record_filename,
            errors=None,
            log_file="/foo/bar"
        )

        job = CrawlerJob.get_by_job(job_id)
        assert job.logs == "/foo/bar"
        assert job.results == sample_record_filename

        workflow = WorkflowObject.get(1)
        assert workflow
        assert workflow.extra_data['crawler_job_id'] == job_id
        crawler_results_path = workflow.extra_data['crawler_results_path']
        assert crawler_results_path == urlparse(sample_record_filename).path

        with pytest.raises(CrawlerJobError):
            submit_results(
                job_id,
                results_uri=sample_record_filename,
                errors=["Some error"],
                log_file=None
            )

        job = CrawlerJob.get_by_job(job_id)
        assert job.status == JobStatus.ERROR
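# A hedged happy-path sketch of the submit_results contract that the two
# test_tasks variants above pin down. Only the keyword arguments and the
# raised exceptions come from the tests; the helper name, URI, and log path
# below are hypothetical placeholders.
def example_submit_results(job_id):
    # An unknown job_id would raise CrawlerJobNotExistError.
    submit_results(
        job_id=job_id,
        results_uri='file:///tmp/results.jl',  # '' raises CrawlerInvalidResultsPath
        errors=None,  # a non-empty list raises CrawlerJobError
        log_file='/tmp/crawler.log',
    )
    # On success the job row is updated (logs, results) and a workflow object
    # is created whose extra_data carries 'crawler_job_id' and
    # 'crawler_results_path' (the path component of the results URI).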
def test_submit_results_with_results_data(
    app, db, halt_workflow, sample_records_uri, sample_records
):
    """Test submit_results passing the data as payload."""
    job_id = uuid.uuid4().hex  # init random value

    with app.app_context():
        CrawlerJob.create(
            job_id=job_id,
            spider="Test",
            workflow=halt_workflow.__name__,
            logs=None,
            results=None,
        )
        db.session.commit()

    with app.app_context():
        job = CrawlerJob.get_by_job(job_id)
        assert job
        assert str(job.status)
        assert job.status == JobStatus.PENDING

        # The URI is deliberately bogus: when results_data is passed, the
        # results file is never read.
        dummy_records_uri = sample_records_uri + 'idontexist'
        submit_results(
            job_id=job_id,
            results_uri=dummy_records_uri,
            results_data=sample_records,
            errors=None,
            log_file="/foo/bar"
        )

        job = CrawlerJob.get_by_job(job_id)
        assert job.logs == "/foo/bar"
        assert job.results == dummy_records_uri

        workflow = WorkflowObject.get(1)
        assert workflow
        assert workflow.extra_data['crawler_job_id'] == job_id
        crawler_results_path = workflow.extra_data['crawler_results_path']
        assert crawler_results_path == urlparse(dummy_records_uri).path

        with pytest.raises(CrawlerJobError):
            submit_results(
                job_id,
                results_uri=dummy_records_uri,
                results_data=sample_records,
                errors=["Some error"],
                log_file=None,
            )

        job = CrawlerJob.get_by_job(job_id)
        assert job.status == JobStatus.ERROR
def test_receivers(app, db, sample_record_string):
    """Test receivers."""
    with requests_mock.Mocker() as requests_mocker:
        job_id = uuid.uuid4().hex
        requests_mocker.register_uri(
            'POST',
            'http://localhost:6800/schedule.json',
            json={
                'jobid': job_id,
                'status': 'ok',
            },
        )
        mock_record = MagicMock()
        prop_mock = PropertyMock(return_value=sample_record_string)
        type(mock_record).raw = prop_mock

        with app.app_context():
            assert receive_oaiharvest_job(
                request=None, records=[mock_record], name="") is None

            receive_oaiharvest_job(
                request=None,
                records=[mock_record],
                name='',
                spider='Test',
                workflow='test'
            )
            job = CrawlerJob.get_by_job(job_id)
            assert job
def schedule_and_wait_crawl(max_wait, *args, **kwargs):
    """Call inspire-crawler schedule_crawl and wait for the task to finish.

    :return: True if the job finished successfully.
    """
    job_id = schedule_crawl(*args, **kwargs)
    log('Crawler job scheduled.', job_id=job_id)
    job = CrawlerJob.get_by_job(job_id)

    sleep_time = current_app.config.get('CLI_HARVEST_SLEEP_TIME', 0.5)
    sleep_counter = 0
    while job.status not in (JobStatus.ERROR, JobStatus.FINISHED):
        if sleep_counter * sleep_time > max_wait:
            log('Timeout reached, skip waiting for job.', logging.ERROR,
                job_id=job_id, job_status=job.status)
            break
        sleep(sleep_time)
        sleep_counter += 1
        db.session.refresh(job)

    if job.status in (JobStatus.ERROR, JobStatus.FINISHED):
        log('Job finished.', job_id=job_id, job_status=job.status)

    return job.status == JobStatus.FINISHED
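# Hedged usage sketch for schedule_and_wait_crawl: positional and keyword
# arguments are forwarded verbatim to schedule_crawl, so the spider and
# workflow names below are hypothetical placeholders, not real configuration.
def example_wait_for_crawl():
    finished = schedule_and_wait_crawl(
        60,  # max_wait: give up after roughly a minute
        spider='example_spider',      # hypothetical spider name
        workflow='example_workflow',  # hypothetical workflow name
    )
    if not finished:
        raise RuntimeError('Crawler job did not finish within the timeout.')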
@responses.activate
def test_receivers(app, db, sample_record):
    """Test receivers."""
    job_id = uuid.uuid4().hex
    responses.add(
        responses.POST,
        "http://localhost:6800/schedule.json",
        body=json.dumps({"jobid": job_id, "status": "ok"}),
        status=200
    )

    mock_record = MagicMock()
    prop_mock = PropertyMock(return_value=sample_record)
    type(mock_record).raw = prop_mock

    with app.app_context():
        assert receive_oaiharvest_job(
            request=None, records=[mock_record], name=""
        ) is None

        receive_oaiharvest_job(
            request=None,
            records=[mock_record],
            name="",
            spider="Test",
            workflow="test"
        )
        job = CrawlerJob.get_by_job(job_id)
        assert job
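# Note: the two test_receivers variants above exercise the same behaviour
# with different HTTP-mocking libraries. requests_mock intercepts the POST
# to scrapyd's /schedule.json through a context manager, while responses
# relies on the @responses.activate decorator to patch the call.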
def test_create_workflow_for_faulty_data(app, db, halt_workflow):
    """Test that faulty crawl results create an error workflow."""
    job_id = uuid.uuid4().hex  # init random value

    with app.app_context():
        CrawlerJob.create(
            job_id=job_id,
            spider="desy",
            workflow=halt_workflow.__name__,
            logs=None,
            results=None,
        )
        db.session.commit()

    with app.app_context():
        job = CrawlerJob.get_by_job(job_id)
        assert job
        assert str(job.status)
        assert job.status == JobStatus.PENDING

        test_data = [{
            'error': 'ValueError',
            'traceback': 'There was a ValueError',
            'xml_record': 'Just an XML string'
        }]
        submit_results(
            job_id=job_id,
            results_uri='idontexist',
            results_data=test_data,
            errors=None,
            log_file="/foo/bar"
        )

        workflow_id = CrawlerWorkflowObject.query.filter_by(job_id=job_id) \
            .one().object_id
        workflow = WorkflowObject.get(workflow_id)

        assert workflow.status == ObjectStatus.ERROR
def test_create_workflow_for_faulty_data(app, db, halt_workflow):
    """Test that crawl results carrying errors create an error workflow."""
    job_id = uuid.uuid4().hex  # init random value

    with app.app_context():
        CrawlerJob.create(
            job_id=job_id,
            spider="desy",
            workflow=halt_workflow.__name__,
            logs=None,
            results=None,
        )
        db.session.commit()

    with app.app_context():
        job = CrawlerJob.get_by_job(job_id)
        assert job
        assert str(job.status)
        assert job.status == JobStatus.PENDING

        test_data = {
            'errors': [{
                'exception': 'ValueError',
                'traceback': 'ValueError on the line 23.'
            }],
            'source_data': 'Just an XML string',
            'record': {},
            'file_name': 'broken.xml'
        }
        submit_results(
            job_id=job_id,
            results_uri='idontexist',
            results_data=[test_data],
            errors=None,
            log_file="/foo/bar"
        )

        workflow_id = CrawlerWorkflowObject.query.filter_by(job_id=job_id) \
            .one().object_id
        workflow = WorkflowObject.get(workflow_id)

        expected_crawl_error = {
            'errors': [{
                'exception': 'ValueError',
                'traceback': 'ValueError on the line 23.'
            }],
            'source_data': 'Just an XML string',
            'file_name': 'broken.xml'
        }

        assert workflow.status == ObjectStatus.ERROR
        assert workflow.data == test_data['record']
        assert workflow.extra_data['crawl_errors'] == expected_crawl_error
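# A sketch of a well-formed crawl result as pinned down by the test above.
# The helper name is ours (hypothetical); the four keys and the shape of the
# 'errors' entries come straight from the test data.
def make_crawl_result(record, source_data, file_name, errors=()):
    return {
        'record': record,            # parsed record; {} when parsing failed
        'errors': list(errors),      # [{'exception': ..., 'traceback': ...}]
        'source_data': source_data,  # raw harvested payload, e.g. an XML string
        'file_name': file_name,      # name of the harvested file
    }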
def test_create_error_workflow_for_wrong_crawl_result(app, db, halt_workflow):
    """Test that a malformed crawl result creates an error workflow."""
    job_id = uuid.uuid4().hex  # init random value

    with app.app_context():
        CrawlerJob.create(
            job_id=job_id,
            spider="desy",
            workflow=halt_workflow.__name__,
            logs=None,
            results=None,
        )
        db.session.commit()

    with app.app_context():
        job = CrawlerJob.get_by_job(job_id)
        assert job
        assert str(job.status)
        assert job.status == JobStatus.PENDING

        test_data = {
            'source_data': 'Just an XML string',
            'record': {},
            # missing 'errors' and 'file_name'
        }
        submit_results(
            job_id=job_id,
            results_uri='idontexist',
            results_data=[test_data],
            errors=None,
            log_file="/foo/bar"
        )

        workflow_id = CrawlerWorkflowObject.query.filter_by(job_id=job_id) \
            .one().object_id
        workflow = WorkflowObject.get(workflow_id)

        expected = {
            'errors': [{
                'exception': 'KeyError',
                'traceback': 'Wrong crawl result format. '
                             'Missing the key `errors`'
            }],
            'file_name': None,
            'source_data': {
                'record': {},
                'source_data': 'Just an XML string'
            },
        }

        assert workflow.status == ObjectStatus.ERROR
        assert workflow.data == {}
        assert workflow.extra_data['crawl_errors'] == expected
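# Sketch of the format check the last test exercises: a result that lacks the
# 'errors' key is rewrapped as a KeyError-style crawl error whose source_data
# carries the original payload. The helper below is illustrative only; the
# message text mirrors the expected dict in the test.
def wrap_wrong_crawl_result(result):
    if 'errors' not in result:
        return {
            'errors': [{
                'exception': 'KeyError',
                'traceback': 'Wrong crawl result format. '
                             'Missing the key `errors`'
            }],
            'file_name': result.get('file_name'),
            'source_data': result,
        }
    return result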