Example #1

@task
def harvesters(async=False, start=None, end=None):
    ''' Runs all harvesters '''
    from scrapi import settings
    settings.CELERY_ALWAYS_EAGER = not async
    from scrapi import registry
    from scrapi.tasks import run_harvester
    from dateutil.parser import parse
    from datetime import date, timedelta

    start = parse(start).date() if start else date.today() - timedelta(settings.DAYS_BACK)
    end = parse(end).date() if end else date.today()

    exceptions = []
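    # Queue one run_harvester task per registered source, collecting any errors
    # raised while dispatching so a single failure does not stop the rest.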
    for harvester_name in registry.keys():
        try:
            run_harvester.delay(harvester_name, start_date=start, end_date=end)
        except Exception as e:
            logger.exception(e)
            exceptions.append(e)

    logger.info("\n\nNumber of exceptions: {}".format(len(exceptions)))
    for exception in exceptions:
        logger.exception(exception)


@task
def provider_map(delete=False):
    ''' Adds favicons and metadata for harvesters to Elasticsearch '''
    from six.moves.urllib import parse as urllib_parse
Example #2
import logging

import mock
import vcr
import pytest

from scrapi import base
from scrapi import registry, requests

logger = logging.getLogger(__name__)


@pytest.fixture(autouse=True)
def mock_maybe_load_response(monkeypatch):
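    # Replace scrapi's cached-response lookup with a mock that returns None, so
    # test requests are served from the recorded cassettes rather than the
    # response cache, and make HarvesterResponse.save a pass-through so nothing
    # is written back during tests.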
    mock_mlr = mock.Mock()
    mock_mlr.return_value = None
    mock_save = lambda x: x

    monkeypatch.setattr(requests, '_maybe_load_response', mock_mlr)
    monkeypatch.setattr(requests.HarvesterResponse, 'save', mock_save)


@pytest.mark.parametrize('harvester_name', filter(lambda x: x != 'test', sorted(map(str, registry.keys()))))
def test_harvester(monkeypatch, harvester_name, *args, **kwargs):
    monkeypatch.setattr(requests.time, 'sleep', lambda *_, **__: None)
    base.settings.RAISE_IN_TRANSFORMER = True

    harvester = registry[harvester_name]

    with vcr.use_cassette('tests/vcr/{}.yaml'.format(harvester_name), match_on=['host'], record_mode='none'):
        harvested = harvester.harvest()
        assert len(harvested) > 0

    normalized = list(filter(lambda x: x is not None, map(harvester.normalize, harvested[:25])))
    assert len(normalized) > 0
Example #3
import logging

import vcr
import pytest
from freezegun import freeze_time

from scrapi import base
from scrapi import registry, requests
from scrapi.base.helpers import compose

logger = logging.getLogger(__name__)


@freeze_time("2007-12-21")
@pytest.mark.parametrize('harvester_name',
                         filter(lambda x: x != 'test',
                                sorted(map(str, registry.keys()))))
def test_harvester(monkeypatch, harvester_name, *args, **kwargs):
    monkeypatch.setattr(requests.time, 'sleep', lambda *_, **__: None)
    base.settings.RAISE_IN_TRANSFORMER = True

    harvester = registry[harvester_name]
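    # record_mode='none' only replays previously recorded responses; a request
    # missing from the cassette raises instead of hitting the network.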

    with vcr.use_cassette('tests/vcr/{}.yaml'.format(harvester_name),
                          match_on=['host'],
                          record_mode='none'):
        harvested = harvester.harvest()
        assert len(harvested) > 0

    normalized = list(filter(lambda x: x is not None,
                             map(harvester.normalize, harvested)))
    assert len(normalized) > 0
Example #4
def lint_all():
    for name in registry.keys():
        lint(name)
Example #5
@task_autoretry(default_retry_delay=settings.CELERY_RETRY_DELAY, max_retries=settings.CELERY_MAX_RETRIES, throws=events.Skip)
@events.logged(events.PROCESSING, 'normalized')
def process_normalized(normalized_doc, raw_doc, **kwargs):
    if not normalized_doc:
        raise events.Skip('Not processing document with id {}'.format(raw_doc['docID']))
    processing.process_normalized(raw_doc, normalized_doc, kwargs)


@app.task
def migrate(migration, source_db=None, sources=tuple(), async=False, dry=True, group_size=1000, **kwargs):

    source_db = source_db or settings.CANONICAL_PROCESSOR
    documents = processing.get_processor(source_db).documents

    doc_sources = sources or registry.keys()
    docs = documents(*doc_sources)
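    # In async mode, dispatch the documents to Celery in chunks of group_size;
    # otherwise apply the migration inline, one document at a time.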
    if async:
        segment = list(islice(docs, group_size))
        while segment:
            migration.s(segment, sources=sources, dry=dry, **kwargs).apply_async()
            segment = list(islice(docs, group_size))
    else:
        for doc in docs:
            migration((doc,), sources=sources, dry=dry, **kwargs)

    if dry:
        logger.info('Dry run complete')

    logger.info('Documents processed for migration {}'.format(str(migration)))
Example #6
    return model_iterator


def next_page_old(query, page):
    return list(query.filter(pk__token__gt=Token(page[-1].pk)))


def next_page_source_partition(query, page):
    return list(query.filter(docID__gt=page[-1].docID))


documents_old = ModelIteratorFactory(DocumentModelOld, next_page_old)
documents = ModelIteratorFactory(DocumentModel,
                                 next_page_source_partition,
                                 default_args=registry.keys())


def try_n_times(n, action, *args, **kwargs):
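    # Retry the action up to n times, logging each failure and re-opening the
    # Cassandra connection before the next attempt; if every attempt fails,
    # the last exception is re-raised below.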
    for _ in xrange(n):
        try:
            return action(*args, **kwargs)
        except Exception as e:
            logger.exception(e)
            time.sleep(15)
            connection_open = setup(force=True, sync=False)
            logger.info("Trying again... Cassandra connection open: {}".format(
                connection_open))
    if e:
        raise e
Example #7
import logging

import vcr
import pytest

from scrapi import base
from scrapi import registry, requests

logger = logging.getLogger(__name__)


@pytest.mark.parametrize('harvester_name', filter(lambda x: x != 'test', sorted(map(str, registry.keys()))))
def test_harvester(monkeypatch, harvester_name, *args, **kwargs):
    monkeypatch.setattr(requests.time, 'sleep', lambda *_, **__: None)
    base.settings.RAISE_IN_TRANSFORMER = True

    harvester = registry[harvester_name]

    with vcr.use_cassette('tests/vcr/{}.yaml'.format(harvester_name), match_on=['host'], record_mode='none'):
        harvested = harvester.harvest()
        assert len(harvested) > 0

    normalized = list(filter(lambda x: x is not None, map(harvester.normalize, harvested[:25])))
    assert len(normalized) > 0
Example #8

@task
def harvesters(async=False, start=None, end=None):
    ''' Runs all harvesters '''
    from scrapi import settings
    settings.CELERY_ALWAYS_EAGER = not async
    from scrapi import registry
    from scrapi.tasks import run_harvester
    from dateutil.parser import parse
    from datetime import date, timedelta

    start = parse(start).date() if start else date.today() - timedelta(settings.DAYS_BACK)
    end = parse(end).date() if end else date.today()

    exceptions = []
    for harvester_name in registry.keys():
        try:
            run_harvester.delay(harvester_name, start_date=start, end_date=end)
        except Exception as e:
            logger.exception(e)
            exceptions.append(e)

    logger.info("\n\nNumber of exceptions: {}".format(len(exceptions)))
    for exception in exceptions:
        logger.exception(exception)


@task
def provider_map(delete=False):
    ''' Adds favicons and metadata for harvesters to Elasticsearch '''
    from six.moves.urllib import parse as urllib_parse
Example #9
def lint_all():
    from scrapi import registry
    for name in registry.keys():
        lint(name)
Example #10
def lint_all():
    for name in registry.keys():
        lint(name)
Example #11
            page = try_n_times(5, list, query)
            while len(page) > 0:
                for doc in page:
                    yield doc
                page = try_n_times(5, next_page, query, page)
    return model_iterator


def next_page_old(query, page):
    return list(query.filter(pk__token__gt=Token(page[-1].pk)))


def next_page_source_partition(query, page):
    return list(query.filter(docID__gt=page[-1].docID))

documents_old = ModelIteratorFactory(DocumentModelOld, next_page_old)
documents = ModelIteratorFactory(DocumentModel, next_page_source_partition, default_args=registry.keys())


def try_n_times(n, action, *args, **kwargs):
    for _ in xrange(n):
        try:
            return action(*args, **kwargs)
        except Exception as e:
            logger.exception(e)
            time.sleep(15)
            connection_open = setup(force=True, sync=False)
            logger.info("Trying again... Cassandra connection open: {}".format(connection_open))
    if e:
        raise e
Example #12
import logging

import vcr
import pytest
from mock import patch
from freezegun import freeze_time

from scrapi import registry, requests

logger = logging.getLogger(__name__)

@freeze_time("2007-12-21")
@pytest.mark.parametrize('harvester_name', sorted(map(str, registry.keys())))
def test_harvester(monkeypatch, harvester_name, *args, **kwargs):
    monkeypatch.setattr(requests.time, 'sleep', lambda *_, **__: None)

    with vcr.use_cassette('tests/vcr/{}.yaml'.format(harvester_name), match_on=['host'], record_mode='none'):
        harvester = registry[harvester_name]
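        # Normalize every harvested document; any exception is logged and then
        # fails the test explicitly via assert False.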
        try:
            normalized = [harvester.normalize(doc) for doc in harvester.harvest()]
        except Exception as e:
            logger.exception(e)
            assert False
    assert len(normalized) > 0
Example #13
    processing.process_normalized(raw_doc, normalized_doc, kwargs)


@app.task
def migrate(migration,
            source_db=None,
            sources=tuple(),
            async=False,
            dry=True,
            group_size=1000,
            **kwargs):

    source_db = source_db or settings.CANONICAL_PROCESSOR
    documents = processing.get_processor(source_db).documents

    doc_sources = sources or registry.keys()
    docs = documents(*doc_sources)
    if async:
        segment = list(islice(docs, group_size))
        while segment:
            migration.s(segment,
                        sources=sources,
                        dry=dry,
                        source_db=source_db,
                        **kwargs).apply_async()
            segment = list(islice(docs, group_size))
    else:
        for doc in docs:
            migration((doc, ), sources=sources, dry=dry, **kwargs)

    if dry:
        logger.info('Dry run complete')
Example #14
def lint_all():
    from scrapi import registry
    for name in registry.keys():
        lint(name)