@task
def harvesters(async=False, start=None, end=None):
    ''' Runs all harvesters '''
    from scrapi import settings
    settings.CELERY_ALWAYS_EAGER = not async

    from scrapi import registry
    from scrapi.tasks import run_harvester
    from datetime import date, timedelta  # needed for the default date window
    from dateutil.parser import parse

    start = parse(start).date() if start else date.today() - timedelta(settings.DAYS_BACK)
    end = parse(end).date() if end else date.today()

    exceptions = []
    for harvester_name in registry.keys():
        try:
            run_harvester.delay(harvester_name, start_date=start, end_date=end)
        except Exception as e:
            logger.exception(e)
            exceptions.append(e)

    logger.info("\n\nNumber of exceptions: {}".format(len(exceptions)))
    for exception in exceptions:
        logger.exception(exception)


@task
def provider_map(delete=False):
    ''' Adds favicons and metadata for harvesters to Elasticsearch '''
    from six.moves.urllib import parse as urllib_parse
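# Usage sketch (assuming these are invoke tasks, as in scrapi's tasks.py;
# the exact CLI flags are an assumption):
#   invoke harvesters --start 2015-03-01 --end 2015-03-07
# With async left False, CELERY_ALWAYS_EAGER is set before scrapi.tasks is
# imported, so each run_harvester call executes synchronously in-process
# instead of being queued to a Celery worker.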
import logging

import mock
import vcr
import pytest

from scrapi import base
from scrapi import registry, requests

logger = logging.getLogger(__name__)


@pytest.fixture(autouse=True)
def mock_maybe_load_response(monkeypatch):
    # Bypass the HarvesterResponse cache layer so tests replay VCR cassettes
    mock_mlr = mock.Mock()
    mock_mlr.return_value = None
    mock_save = lambda x: x
    monkeypatch.setattr(requests, '_maybe_load_response', mock_mlr)
    monkeypatch.setattr(requests.HarvesterResponse, 'save', mock_save)


@pytest.mark.parametrize('harvester_name', filter(lambda x: x != 'test', sorted(map(str, registry.keys()))))
def test_harvester(monkeypatch, harvester_name, *args, **kwargs):
    monkeypatch.setattr(requests.time, 'sleep', lambda *_, **__: None)
    base.settings.RAISE_IN_TRANSFORMER = True
    harvester = registry[harvester_name]
    with vcr.use_cassette('tests/vcr/{}.yaml'.format(harvester_name), match_on=['host'], record_mode='none'):
        harvested = harvester.harvest()
        assert len(harvested) > 0

    normalized = list(filter(lambda x: x is not None, map(harvester.normalize, harvested[:25])))
    assert len(normalized) > 0
import logging

import vcr
import pytest
from freezegun import freeze_time

from scrapi import base
from scrapi import registry, requests
from scrapi.base.helpers import compose

logger = logging.getLogger(__name__)


@freeze_time("2007-12-21")
@pytest.mark.parametrize('harvester_name', filter(lambda x: x != 'test', sorted(map(str, registry.keys()))))
def test_harvester(monkeypatch, harvester_name, *args, **kwargs):
    monkeypatch.setattr(requests.time, 'sleep', lambda *_, **__: None)
    base.settings.RAISE_IN_TRANSFORMER = True
    harvester = registry[harvester_name]
    with vcr.use_cassette('tests/vcr/{}.yaml'.format(harvester_name), match_on=['host'], record_mode='none'):
        harvested = harvester.harvest()
        assert len(harvested) > 0

    # list() so len() works under Python 3, where filter() returns an iterator
    normalized = list(filter(lambda x: x is not None, map(harvester.normalize, harvested)))
    assert len(normalized) > 0
def lint_all():
    for name in registry.keys():
        lint(name)
@task_autoretry(default_retry_delay=settings.CELERY_RETRY_DELAY,
                max_retries=settings.CELERY_MAX_RETRIES,
                throws=events.Skip)
@events.logged(events.PROCESSING, 'normalized')
def process_normalized(normalized_doc, raw_doc, **kwargs):
    if not normalized_doc:
        raise events.Skip('Not processing document with id {}'.format(raw_doc['docID']))
    processing.process_normalized(raw_doc, normalized_doc, kwargs)


@app.task
def migrate(migration, source_db=None, sources=tuple(), async=False, dry=True, group_size=1000, **kwargs):
    source_db = source_db or settings.CANONICAL_PROCESSOR
    documents = processing.get_processor(source_db).documents

    doc_sources = sources or registry.keys()
    docs = documents(*doc_sources)

    if async:
        # Fan out in batches of group_size as asynchronous Celery subtasks
        segment = list(islice(docs, group_size))
        while segment:
            migration.s(segment, sources=sources, dry=dry, **kwargs).apply_async()
            segment = list(islice(docs, group_size))
    else:
        for doc in docs:
            migration((doc,), sources=sources, dry=dry, **kwargs)

    if dry:
        logger.info('Dry run complete')

    logger.info('Documents processed for migration {}'.format(str(migration)))
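# Usage sketch (hypothetical names): `rename` stands in for any migration task
# registered with the same Celery app, and its import path is an assumption.
# A dry run walks every document for the given sources without writing anything;
# async=True fans the work out in group_size batches instead of iterating inline.
from scrapi.migrations import rename  # assumed location of migration tasks

migrate(rename, sources=['arxiv_oai'], async=False, dry=True)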
    return model_iterator


def next_page_old(query, page):
    return list(query.filter(pk__token__gt=Token(page[-1].pk)))


def next_page_source_partition(query, page):
    return list(query.filter(docID__gt=page[-1].docID))


documents_old = ModelIteratorFactory(DocumentModelOld, next_page_old)
documents = ModelIteratorFactory(DocumentModel, next_page_source_partition, default_args=registry.keys())


def try_n_times(n, action, *args, **kwargs):
    # Hold the last exception explicitly: under Python 3 the except-block
    # variable is cleared on exit, so `raise e` after the loop would NameError
    last_exception = None
    for _ in xrange(n):
        try:
            return action(*args, **kwargs)
        except Exception as e:
            last_exception = e
            logger.exception(e)
            time.sleep(15)
            connection_open = setup(force=True, sync=False)
            logger.info("Trying again... Cassandra connection open: {}".format(connection_open))
    if last_exception:
        raise last_exception
import logging

import vcr
import pytest

from scrapi import base
from scrapi import registry, requests

logger = logging.getLogger(__name__)


@pytest.mark.parametrize('harvester_name', filter(lambda x: x != 'test', sorted(map(str, registry.keys()))))
def test_harvester(monkeypatch, harvester_name, *args, **kwargs):
    monkeypatch.setattr(requests.time, 'sleep', lambda *_, **__: None)
    base.settings.RAISE_IN_TRANSFORMER = True
    harvester = registry[harvester_name]
    with vcr.use_cassette('tests/vcr/{}.yaml'.format(harvester_name), match_on=['host'], record_mode='none'):
        harvested = harvester.harvest()
        assert len(harvested) > 0

    normalized = list(filter(lambda x: x is not None, map(harvester.normalize, harvested[:25])))
    assert len(normalized) > 0
def lint_all():
    from scrapi import registry
    for name in registry.keys():
        lint(name)
        # (inside model_iterator) page through each queryset, retrying reads
        page = try_n_times(5, list, query)
        while len(page) > 0:
            for doc in page:
                yield doc
            page = try_n_times(5, next_page, query, page)

    return model_iterator


def next_page_old(query, page):
    return list(query.filter(pk__token__gt=Token(page[-1].pk)))


def next_page_source_partition(query, page):
    return list(query.filter(docID__gt=page[-1].docID))


documents_old = ModelIteratorFactory(DocumentModelOld, next_page_old)
documents = ModelIteratorFactory(DocumentModel, next_page_source_partition, default_args=registry.keys())


def try_n_times(n, action, *args, **kwargs):
    # Hold the last exception explicitly: under Python 3 the except-block
    # variable is cleared on exit, so `raise e` after the loop would NameError
    last_exception = None
    for _ in xrange(n):
        try:
            return action(*args, **kwargs)
        except Exception as e:
            last_exception = e
            logger.exception(e)
            time.sleep(15)
            connection_open = setup(force=True, sync=False)
            logger.info("Trying again... Cassandra connection open: {}".format(connection_open))
    if last_exception:
        raise last_exception
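# Usage sketch: retry any flaky Cassandra read up to five times, re-opening the
# connection between attempts. Assumes DocumentModel is the cqlengine model used
# above; list() forces evaluation so failures surface inside the retry loop.
rows = try_n_times(5, list, DocumentModel.objects.limit(100))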
import logging

import vcr
import pytest
from mock import patch
from freezegun import freeze_time

from scrapi import registry, requests

logger = logging.getLogger(__name__)


@freeze_time("2007-12-21")
@pytest.mark.parametrize('harvester_name', sorted(map(str, registry.keys())))
def test_harvester(monkeypatch, harvester_name, *args, **kwargs):
    monkeypatch.setattr(requests.time, 'sleep', lambda *_, **__: None)
    with vcr.use_cassette('tests/vcr/{}.yaml'.format(harvester_name), match_on=['host'], record_mode='none'):
        harvester = registry[harvester_name]
        try:
            normalized = [harvester.normalize(doc) for doc in harvester.harvest()]
        except Exception as e:
            logger.exception(e)
            assert False

    assert len(normalized) > 0
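# Usage sketch (hypothetical harvester name): recording a cassette for a new
# harvester. record_mode='once' lets vcr capture live HTTP traffic on the first
# run; the tests above then replay it offline with record_mode='none', failing
# fast if the cassette under tests/vcr/ is missing.
import vcr

with vcr.use_cassette('tests/vcr/new_harvester.yaml', match_on=['host'], record_mode='once'):
    harvested = registry['new_harvester'].harvest()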
    processing.process_normalized(raw_doc, normalized_doc, kwargs)


@app.task
def migrate(migration, source_db=None, sources=tuple(), async=False, dry=True, group_size=1000, **kwargs):
    source_db = source_db or settings.CANONICAL_PROCESSOR
    documents = processing.get_processor(source_db).documents

    doc_sources = sources or registry.keys()
    docs = documents(*doc_sources)

    if async:
        segment = list(islice(docs, group_size))
        while segment:
            migration.s(segment, sources=sources, dry=dry, source_db=source_db, **kwargs).apply_async()
            segment = list(islice(docs, group_size))
    else:
        for doc in docs:
            migration((doc,), sources=sources, dry=dry, **kwargs)

    if dry:
        logger.info('Dry run complete')