Ejemplo n.º 1
0
def cross_db(docs,
             source_db=None,
             target_db=None,
             index=None,
             versions=False,
             **kwargs):
    """
    Migration to go between
        cassandra > postgres
        postgres > cassandra
        cassandra > elasticsearch
        postgres > elasticsearch

    source db can be passed in to the migrate task, and will default to the CANONICAL_PROCESSOR specified in settings
    target_db will be specified when the task is called

    :param docs: iterable of documents; each one must expose ``raw``
        (indexable by ``'doc'``, ``'source'`` and ``'docID'`` and carrying an
        ``attributes`` mapping) and ``normalized``.
    :param source_db: name handed to ``get_processor`` for the source; falls
        back to ``settings.CANONICAL_PROCESSOR`` when falsy. The source
        processor is only consulted when ``versions`` is truthy.
    :param target_db: one of ``'postgres'``, ``'cassandra'`` or
        ``'elasticsearch'``.
    :param index: accepted for call compatibility; not read by this function.
    :param versions: when truthy, every ``(raw, normalized)`` pair yielded by
        ``source_processor.get_versions`` is migrated instead of only the
        document's current raw/normalized pair.
    :param kwargs: only ``dry`` is read; a truthy value skips all writes to
        the target processor.
    :raises AssertionError: if ``target_db`` is missing or not a supported
        target. (These are plain ``assert`` statements, so they are skipped
        when Python runs with ``-O``.)
    """
    assert target_db, 'Please specify a target db for the migration -- either postgres, cassandra or elasticsearch'
    assert target_db in [
        'postgres', 'cassandra', 'elasticsearch'
    ], 'Invalid target database - please specify either postgres, cassandra or elasticsearch'
    source_processor = get_processor(source_db or settings.CANONICAL_PROCESSOR)
    target_processor = get_processor(target_db)
    # The dry-run flag does not change between documents, so read it once.
    dry = kwargs.get('dry')
    for doc in docs:
        # Any failure on a single document is logged and reported, then the
        # migration carries on with the next document.
        try:
            if not doc.raw['doc']:
                # corrupted database item has no doc element
                message = 'No doc element in raw doc -- could not migrate document from {} with id {}'.format(
                    doc.raw.attributes['source'], doc.raw.attributes['docID'])
                log_to_sentry(message)
                logger.info(message)
                continue

            raw, normalized = doc.raw, doc.normalized

            if dry:
                # Dry run: the document was validated above, nothing is written.
                continue

            if versions:
                for raw_version, norm_version in source_processor.get_versions(
                        raw['source'], raw['docID']):
                    target_processor.process_raw(raw_version)
                    if norm_version:
                        target_processor.process_normalized(
                            raw_version, norm_version)
                    else:
                        logger.info(
                            'Not storing migrated normalized version from {} with id {}, document is not in approved set list.'
                            .format(raw.attributes['source'],
                                    raw.attributes['docID']))
            else:
                target_processor.process_raw(raw)
                if normalized:
                    target_processor.process_normalized(raw, normalized)
                else:
                    logger.info(
                        'Not storing migrated normalized from {} with id {}, document is not in approved set list.'
                        .format(raw.attributes['source'],
                                raw.attributes['docID']))
        except Exception as e:
            logger.exception(e)
            log_to_sentry(e)
Ejemplo n.º 2
0
def record_or_load_response(method,
                            url,
                            throttle=None,
                            force=False,
                            params=None,
                            expected=(200, ),
                            **kwargs):
    """Return a recorded response for ``method``/``url``, or fetch and record one.

    A previously recorded response is returned as-is when it exists, is
    marked ``ok`` and ``force`` is falsy. Otherwise the request is made with
    ``requests.request`` and the result is stored, either as a new
    ``HarvesterResponse`` or by updating the existing record.

    :param method: HTTP method passed to ``requests.request``.
    :param url: URL used for the lookup and the request; a newly created
        record stores it lower-cased.
    :param throttle: passed to ``maybe_sleep`` before the request is made.
    :param force: when truthy, skip the recorded response and re-request.
    :param params: accepted for call compatibility; not read by this function.
    :param expected: status codes that count as ``ok`` when an existing
        record is updated.
    :param kwargs: forwarded unchanged to ``requests.request``.
    :return: the recorded response, or the result of ``.save()`` on the
        created/updated record.
    """
    resp = _maybe_load_response(method, url)

    if not force and resp and resp.ok:
        logger.info('Return recorded response from "{}"'.format(url))
        return resp

    if force:
        logger.warning('Force updating request to "{}"'.format(url))
    else:
        logger.info('Making request to "{}"'.format(url))

    maybe_sleep(throttle)

    response = requests.request(method, url, **kwargs)

    if not response.ok:
        events.log_to_sentry('Got non-ok response code.',
                             url=url,
                             method=method)

    # Normalise the body to bytes in a local variable. On a real
    # requests.Response, ``content`` is a read-only property, so assigning the
    # encoded value back onto the response would raise AttributeError.
    content = response.content
    if isinstance(content, six.text_type):
        content = content.encode('utf8')

    if not resp:
        # NOTE(review): a new record uses response.ok only, while the update
        # path below also honours ``expected`` -- confirm this is intended.
        return HarvesterResponse(url=url.lower(),
                                 method=method,
                                 ok=response.ok,
                                 content=content,
                                 encoding=response.encoding,
                                 status_code=response.status_code,
                                 headers_str=json.dumps(dict(
                                     response.headers))).save()

    logger.warning('Skipped recorded response from "{}"'.format(url))

    return resp.update(ok=(response.ok or response.status_code in expected),
                       content=content,
                       encoding=response.encoding,
                       status_code=response.status_code,
                       headers_str=json.dumps(dict(response.headers))).save()
Ejemplo n.º 3
0
def record_or_load_response(method, url, throttle=None, force=False, params=None, **kwargs):
    """Serve a recorded response for ``method``/``url`` or fetch and record one.

    When ``params`` is given it is merged into ``url`` first, so the lookup
    and the request both use the full URL. A recorded response that is
    marked ``ok`` is returned directly unless ``force`` is truthy; otherwise
    the request is made (after sleeping ``throttle`` seconds, if set) and the
    result is saved as a new ``HarvesterResponse`` or written over the
    existing record. Extra ``kwargs`` go to ``requests.request`` unchanged.
    """
    if params:
        url = furl.furl(url).set(args=params).url

    recorded = _maybe_load_response(method, url)

    if force:
        logger.warning('Force updating request to "{}"'.format(url))
    else:
        if recorded and recorded.ok:
            logger.info('Return recorded response from "{}"'.format(url))
            return recorded
        logger.info('Making request to "{}"'.format(url))

    if throttle:
        time.sleep(throttle)

    live = requests.request(method, url, **kwargs)

    if not live.ok:
        events.log_to_sentry('Got non-ok response code.', url=url, method=method)

    is_new = not recorded
    if not is_new:
        logger.warning('Skipped recorded response from "{}"'.format(url))

    # The same fields are stored whether the record is created or updated.
    fields = dict(
        ok=live.ok,
        content=live.content,
        encoding=live.encoding,
        status_code=live.status_code,
        headers_str=json.dumps(dict(live.headers)),
    )

    if is_new:
        return HarvesterResponse(url=url, method=method, **fields).save()

    return recorded.update(**fields).save()
Ejemplo n.º 4
0
def cross_db(docs, source_db=None, target_db=None, index=None, versions=False, **kwargs):
    """
    Migration to go between
        cassandra > postgres
        postgres > cassandra
        cassandra > elasticsearch
        postgres > elasticsearch

    source db can be passed in to the migrate task, and will default to the CANONICAL_PROCESSOR specified in settings
    target_db will be specified when the task is called

    :param docs: iterable of documents; each one must expose ``raw``
        (indexable by ``'doc'``, ``'source'`` and ``'docID'`` and carrying an
        ``attributes`` mapping) and ``normalized``.
    :param source_db: name handed to ``get_processor`` for the source; falls
        back to ``settings.CANONICAL_PROCESSOR`` when falsy. The source
        processor is only consulted when ``versions`` is truthy.
    :param target_db: one of ``'postgres'``, ``'cassandra'`` or
        ``'elasticsearch'``.
    :param index: accepted for call compatibility; not read by this function.
    :param versions: when truthy, every ``(raw, normalized)`` pair yielded by
        ``source_processor.get_versions`` is migrated instead of only the
        document's current raw/normalized pair.
    :param kwargs: only ``dry`` is read; a truthy value skips all writes to
        the target processor.
    :raises AssertionError: if ``target_db`` is missing or not a supported
        target. (These are plain ``assert`` statements, so they are skipped
        when Python runs with ``-O``.)
    """
    assert target_db, 'Please specify a target db for the migration -- either postgres, cassandra or elasticsearch'
    assert target_db in ['postgres', 'cassandra', 'elasticsearch'], 'Invalid target database - please specify either postgres, cassandra or elasticsearch'
    source_processor = get_processor(source_db or settings.CANONICAL_PROCESSOR)
    target_processor = get_processor(target_db)
    # The dry-run flag does not change between documents, so read it once.
    dry = kwargs.get('dry')
    for doc in docs:
        # Any failure on a single document is logged and reported, then the
        # migration carries on with the next document.
        try:
            if not doc.raw['doc']:
                # corrupted database item has no doc element
                message = 'No doc element in raw doc -- could not migrate document from {} with id {}'.format(doc.raw.attributes['source'], doc.raw.attributes['docID'])
                log_to_sentry(message)
                logger.info(message)
                continue

            raw, normalized = doc.raw, doc.normalized

            if dry:
                # Dry run: the document was validated above, nothing is written.
                continue

            if versions:
                for raw_version, norm_version in source_processor.get_versions(raw['source'], raw['docID']):
                    target_processor.process_raw(raw_version)
                    if norm_version:
                        target_processor.process_normalized(raw_version, norm_version)
                    else:
                        logger.info('Not storing migrated normalized version from {} with id {}, document is not in approved set list.'.format(raw.attributes['source'], raw.attributes['docID']))
            else:
                target_processor.process_raw(raw)
                if normalized:
                    target_processor.process_normalized(raw, normalized)
                else:
                    logger.info('Not storing migrated normalized from {} with id {}, document is not in approved set list.'.format(raw.attributes['source'], raw.attributes['docID']))
        except Exception as e:
            logger.exception(e)
            log_to_sentry(e)
Ejemplo n.º 5
0
def record_or_load_response(method, url, throttle=None, force=False, params=None, expected=(200,), **kwargs):
    """Return a recorded response for ``method``/``url``, or fetch and record one.

    A previously recorded response is returned as-is when it exists, is
    marked ``ok`` and ``force`` is falsy. Otherwise the request is made with
    ``requests.request`` and the result is stored, either as a new
    ``HarvesterResponse`` or by updating the existing record.

    :param method: HTTP method passed to ``requests.request``.
    :param url: URL used for the lookup and the request; a newly created
        record stores it lower-cased.
    :param throttle: passed to ``maybe_sleep`` before the request is made.
    :param force: when truthy, skip the recorded response and re-request.
    :param params: accepted for call compatibility; not read by this function.
    :param expected: status codes that count as ``ok`` when an existing
        record is updated.
    :param kwargs: forwarded unchanged to ``requests.request``.
    :return: the recorded response, or the result of ``.save()`` on the
        created/updated record.
    """
    resp = _maybe_load_response(method, url)

    if not force and resp and resp.ok:
        logger.info('Return recorded response from "{}"'.format(url))
        return resp

    if force:
        logger.warning('Force updating request to "{}"'.format(url))
    else:
        logger.info('Making request to "{}"'.format(url))

    maybe_sleep(throttle)

    response = requests.request(method, url, **kwargs)

    if not response.ok:
        events.log_to_sentry('Got non-ok response code.', url=url, method=method)

    # Normalise the body to bytes in a local variable. On a real
    # requests.Response, ``content`` is a read-only property, so assigning the
    # encoded value back onto the response would raise AttributeError.
    content = response.content
    if isinstance(content, six.text_type):
        content = content.encode('utf8')

    if not resp:
        # NOTE(review): a new record uses response.ok only, while the update
        # path below also honours ``expected`` -- confirm this is intended.
        return HarvesterResponse(
            url=url.lower(),
            method=method,
            ok=response.ok,
            content=content,
            encoding=response.encoding,
            status_code=response.status_code,
            headers_str=json.dumps(dict(response.headers))
        ).save()

    logger.warning('Skipped recorded response from "{}"'.format(url))

    return resp.update(
        ok=(response.ok or response.status_code in expected),
        content=content,
        encoding=response.encoding,
        status_code=response.status_code,
        headers_str=json.dumps(dict(response.headers))
    ).save()