def cross_db(docs, source_db=None, target_db=None, index=None, versions=False, **kwargs):
    """Copy documents from one storage backend into another.

    Supported directions:
        cassandra > postgres
        postgres > cassandra
        cassandra > elasticsearch
        postgres > elasticsearch

    :param docs: iterable of documents exposing ``raw`` and ``normalized``.
    :param source_db: name of the backend to read versions from; falls back
        to ``settings.CANONICAL_PROCESSOR`` when falsy.
    :param target_db: name of the backend to write to; required, and must be
        'postgres', 'cassandra' or 'elasticsearch'.
    :param index: accepted but not used by this function.
    :param versions: when truthy, every (raw, normalized) pair yielded by the
        source processor's ``get_versions`` is written instead of only the
        document's current raw/normalized pair.
    :param kwargs: a truthy ``dry`` entry makes the run write nothing.
    :raises AssertionError: when ``target_db`` is missing or not supported.

    A failure on one document is logged and sent to sentry; the loop then
    moves on to the next document.
    """
    supported_targets = ('postgres', 'cassandra', 'elasticsearch')
    assert target_db, 'Please specify a target db for the migration -- either postgres or elasticsearch'
    assert target_db in supported_targets, 'Invalid target database - please specify either postgres, cassandra or elasticsearch'

    reader = get_processor(source_db or settings.CANONICAL_PROCESSOR)
    writer = get_processor(target_db)

    for document in docs:
        try:
            # A record without a 'doc' element is corrupted: report and skip.
            if not document.raw['doc']:
                report = 'No doc element in raw doc -- could not migrate document from {} with id {}'.format(
                    document.raw.attributes['source'],
                    document.raw.attributes['docID'],
                )
                log_to_sentry(report)
                logger.info(report)
                continue

            current_raw, current_normalized = document.raw, document.normalized

            # Dry runs stop here, before anything is written to the target.
            if kwargs.get('dry'):
                continue

            if not versions:
                writer.process_raw(current_raw)
                if current_normalized:
                    writer.process_normalized(current_raw, current_normalized)
                else:
                    logger.info(
                        'Not storing migrated normalized from {} with id {}, document is not in approved set list.'.format(
                            current_raw.attributes['source'], current_raw.attributes['docID']))
                continue

            history = reader.get_versions(current_raw['source'], current_raw['docID'])
            for old_raw, old_normalized in history:
                writer.process_raw(old_raw)
                if old_normalized:
                    writer.process_normalized(old_raw, old_normalized)
                else:
                    logger.info(
                        'Not storing migrated normalized version from {} with id {}, document is not in approved set list.'.format(
                            current_raw.attributes['source'], current_raw.attributes['docID']))
        except Exception as error:
            logger.exception(error)
            log_to_sentry(error)
def record_or_load_response(method, url, throttle=None, force=False, params=None, expected=(200, ), **kwargs):
    """Return a recorded response for ``method``/``url``, fetching it if needed.

    A record found by ``_maybe_load_response`` is returned as-is when it is
    marked ok and ``force`` is falsy. Otherwise the request is issued with
    ``requests.request`` and the outcome is saved, either as a new
    ``HarvesterResponse`` or by updating the existing record.

    :param method: HTTP method passed to ``requests.request``.
    :param url: URL to request; new records are stored under ``url.lower()``.
    :param throttle: handed to ``maybe_sleep`` before the request is made.
    :param force: when truthy, always re-issue the request.
    :param params: accepted but not used by this function.
        NOTE(review): presumably callers fold query parameters into ``url``
        before calling -- confirm.
    :param expected: status codes that count as a successful result even
        when ``response.ok`` is false.
    :param kwargs: forwarded unchanged to ``requests.request``.
    :return: whatever ``.save()`` returns for the created or updated record,
        or the previously recorded response when it is reused.
    """
    resp = _maybe_load_response(method, url)

    if not force and resp and resp.ok:
        logger.info('Return recorded response from "{}"'.format(url))
        return resp

    if force:
        logger.warning('Force updating request to "{}"'.format(url))
    else:
        logger.info('Making request to "{}"'.format(url))

    maybe_sleep(throttle)

    response = requests.request(method, url, **kwargs)

    if not response.ok:
        events.log_to_sentry('Got non-ok response code.', url=url, method=method)

    # Normalise into a local rather than assigning back onto the response:
    # ``requests.Response.content`` is a read-only property.
    content = response.content
    if isinstance(content, six.text_type):
        content = content.encode('utf8')

    # Compute the recorded fields once so that new and updated records agree.
    # Previously only the update path honoured ``expected``, so a first-time
    # recording of an expected status was stored as not-ok and re-fetched.
    recorded = dict(
        ok=(response.ok or response.status_code in expected),
        content=content,
        encoding=response.encoding,
        status_code=response.status_code,
        headers_str=json.dumps(dict(response.headers)),
    )

    if not resp:
        return HarvesterResponse(url=url.lower(), method=method, **recorded).save()

    logger.warning('Skipped recorded response from "{}"'.format(url))

    return resp.update(**recorded).save()
def record_or_load_response(method, url, throttle=None, force=False, params=None, **kwargs):
    """Return a recorded response for ``method``/``url``, fetching it if needed.

    When ``params`` is truthy it is merged into ``url`` with ``furl`` first,
    so lookup, request and storage all use the same final URL. A record
    found by ``_maybe_load_response`` is reused when it is marked ok and
    ``force`` is falsy; otherwise the request is made with
    ``requests.request`` (after sleeping ``throttle`` seconds, if given) and
    the outcome is saved as a new ``HarvesterResponse`` or as an update to
    the existing record.

    :return: the reused record, or whatever ``.save()`` returns for the
        created or updated record.
    """
    if params:
        url = furl.furl(url).set(args=params).url

    cached = _maybe_load_response(method, url)

    # Reuse the stored record unless a refresh is forced or it is unusable.
    if not (force or not cached or not cached.ok):
        logger.info('Return recorded response from "{}"'.format(url))
        return cached

    if not force:
        logger.info('Making request to "{}"'.format(url))
    else:
        logger.warning('Force updating request to "{}"'.format(url))

    if throttle:
        time.sleep(throttle)

    fresh = requests.request(method, url, **kwargs)

    if not fresh.ok:
        events.log_to_sentry('Got non-ok response code.', url=url, method=method)

    first_recording = not cached
    if not first_recording:
        logger.warning('Skipped recorded response from "{}"'.format(url))

    snapshot = {
        'ok': fresh.ok,
        'content': fresh.content,
        'encoding': fresh.encoding,
        'status_code': fresh.status_code,
        'headers_str': json.dumps(dict(fresh.headers)),
    }

    if first_recording:
        return HarvesterResponse(url=url, method=method, **snapshot).save()
    return cached.update(**snapshot).save()
def cross_db(docs, source_db=None, target_db=None, index=None, versions=False, **kwargs):
    """Migrate documents between database backends.

    Possible migrations:
        cassandra > postgres
        postgres > cassandra
        cassandra > elasticsearch
        postgres > elasticsearch

    ``source_db`` may be supplied by the migrate task and defaults to the
    ``CANONICAL_PROCESSOR`` named in settings. ``target_db`` is mandatory
    and is checked with assertions, so an absent or unsupported value
    raises ``AssertionError``.

    With ``versions`` truthy, each (raw, normalized) pair from the source
    processor's ``get_versions`` is written; otherwise only the document's
    current pair is. A truthy ``dry`` keyword argument skips all writes.
    ``index`` is accepted but not used here. Exceptions raised while
    handling a document are logged and forwarded to sentry, and processing
    continues with the next document.
    """
    assert target_db, 'Please specify a target db for the migration -- either postgres or elasticsearch'
    assert target_db in ('postgres', 'cassandra', 'elasticsearch'), 'Invalid target database - please specify either postgres, cassandra or elasticsearch'

    origin = get_processor(source_db or settings.CANONICAL_PROCESSOR)
    destination = get_processor(target_db)

    missing_doc = 'No doc element in raw doc -- could not migrate document from {} with id {}'
    skipped_version = 'Not storing migrated normalized version from {} with id {}, document is not in approved set list.'
    skipped_current = 'Not storing migrated normalized from {} with id {}, document is not in approved set list.'

    for item in docs:
        try:
            if not item.raw['doc']:
                # corrupted database item has no doc element
                problem = missing_doc.format(item.raw.attributes['source'], item.raw.attributes['docID'])
                log_to_sentry(problem)
                logger.info(problem)
                continue

            raw_record, normalized_record = item.raw, item.normalized

            if kwargs.get('dry'):
                # Dry run: nothing is written to the target.
                continue

            if versions:
                pairs = origin.get_versions(raw_record['source'], raw_record['docID'])
                for raw_old, normalized_old in pairs:
                    destination.process_raw(raw_old)
                    if not normalized_old:
                        logger.info(skipped_version.format(
                            raw_record.attributes['source'], raw_record.attributes['docID']))
                    else:
                        destination.process_normalized(raw_old, normalized_old)
            else:
                destination.process_raw(raw_record)
                if not normalized_record:
                    logger.info(skipped_current.format(
                        raw_record.attributes['source'], raw_record.attributes['docID']))
                else:
                    destination.process_normalized(raw_record, normalized_record)
        except Exception as exc:
            logger.exception(exc)
            log_to_sentry(exc)
def record_or_load_response(method, url, throttle=None, force=False, params=None, expected=(200,), **kwargs):
    """Load a recorded HTTP response, or make the request and record it.

    The record returned by ``_maybe_load_response(method, url)`` is reused
    when it is marked ok and ``force`` is falsy. In every other case the
    request is sent through ``requests.request`` and the result is saved:
    as a new ``HarvesterResponse`` when no record existed, or by updating
    the existing one.

    :param method: HTTP method passed to ``requests.request``.
    :param url: URL to request; a new record is keyed on ``url.lower()``.
    :param throttle: passed to ``maybe_sleep`` before the request is sent.
    :param force: re-issue the request even when a usable record exists.
    :param params: not referenced anywhere in this function.
        NOTE(review): looks like callers are expected to have merged query
        parameters into ``url`` already -- confirm.
    :param expected: status codes treated as success in addition to
        whatever ``response.ok`` reports.
    :param kwargs: forwarded unchanged to ``requests.request``.
    :return: the reused record, or the result of ``.save()`` on the created
        or updated record.
    """
    resp = _maybe_load_response(method, url)

    if not force and resp and resp.ok:
        logger.info('Return recorded response from "{}"'.format(url))
        return resp

    if force:
        logger.warning('Force updating request to "{}"'.format(url))
    else:
        logger.info('Making request to "{}"'.format(url))

    maybe_sleep(throttle)

    response = requests.request(method, url, **kwargs)

    if not response.ok:
        events.log_to_sentry('Got non-ok response code.', url=url, method=method)

    # Keep the encoded body in a local; assigning to ``response.content``
    # fails on a real ``requests.Response`` because it is a read-only property.
    body = response.content
    if isinstance(body, six.text_type):
        body = body.encode('utf8')

    # ``expected`` must apply to first-time recordings as well as updates.
    # Otherwise an expected status is stored as not-ok on the first call
    # and is needlessly requested again on the second.
    succeeded = response.ok or response.status_code in expected
    headers_str = json.dumps(dict(response.headers))

    if not resp:
        return HarvesterResponse(
            url=url.lower(),
            method=method,
            ok=succeeded,
            content=body,
            encoding=response.encoding,
            status_code=response.status_code,
            headers_str=headers_str
        ).save()

    logger.warning('Skipped recorded response from "{}"'.format(url))

    return resp.update(
        ok=succeeded,
        content=body,
        encoding=response.encoding,
        status_code=response.status_code,
        headers_str=headers_str
    ).save()