def setup_pipeline(source_definition):
    """Queue a transform-then-load chain for every extracted item.

    The objects named by the ``extractor``, ``transformer`` and ``loader``
    entries of ``source_definition`` are resolved with ``load_object``.
    Every item yielded by the extractor's ``run()`` is unpacked into the
    transformer's ``s(...)`` call, the result is piped into the loader's
    ``s(...)`` and the combination is dispatched with ``delay()``.

    :param source_definition: configuration of a single source; passed to
        the extractor constructor and, as a keyword, to both ``s`` calls.
    :type source_definition: dict
    """
    extractor_cls = load_object(source_definition['extractor'])
    extractor = extractor_cls(source_definition)

    transformer_cls = load_object(source_definition['transformer'])
    transformer = transformer_cls()

    loader_cls = load_object(source_definition['loader'])
    loader = loader_cls()

    for extracted in extractor.run():
        transform_step = transformer.s(
            *extracted, source_definition=source_definition)
        load_step = loader.s(source_definition=source_definition)
        (transform_step | load_step).delay()
def transform_item(self, raw_item_content_type, raw_item, item, class_name=False):
    """Turn ``item`` and its configured sub items into index tuples.

    :param raw_item_content_type: passed through unchanged to the item
        class and to the recursive calls.
    :param raw_item: raw representation handed to the item class.
    :param item: object exposing ``xpath``; used to look up the class name
        and to locate sub items.
    :param class_name: key into ``self.source_definition['mapping']``; when
        falsy it is read from ``item.xpath("local-name()")``.
    :returns: a list of 5-tuples ``(combined object id, object id,
        combined index doc, index doc, doc_type)``: first the tuple for
        ``item`` itself, then those of its sub items. An empty list when
        the class name is not present in the mapping.
    """
    if not class_name:
        class_name = item.xpath("local-name()")

    # Guard clause: unmapped class names are logged and skipped.
    if class_name not in self.source_definition['mapping']:
        log.info('Skipping %s, does not exist in mapping' % class_name)
        return []

    item_source = self.source_definition['mapping'][class_name]
    item_class_name = item_source['item']

    # Sub items are transformed first, but returned after the parent.
    sub_results = []
    if 'sub_items' in item_source:
        for sub_class_name, sub_path in item_source['sub_items'].items():
            for sub_element in item.xpath(sub_path):
                sub_results += self.transform_item(
                    raw_item_content_type,
                    etree.tostring(sub_element),
                    sub_element,
                    class_name=sub_class_name,
                )

    item_class = load_object(item_class_name)
    transformed = item_class(
        self.source_definition,
        raw_item_content_type,
        raw_item,
        item,
        unicode(item_source['doc_type']),
    )
    self.add_resolveable_media_urls(transformed)

    own_result = (
        transformed.get_combined_object_id(),
        transformed.get_object_id(),
        transformed.get_combined_index_doc(),
        transformed.get_index_doc(),
        transformed.doc_type,
    )
    return [own_result] + sub_results
def transform_item(self, raw_item_content_type, raw_item, item, class_name=False):
    """Build the index tuple for ``item`` followed by those of its sub items.

    The mapping entry for ``class_name`` (taken from
    ``item.xpath("local-name()")`` when no class name is given) supplies the
    item class, the ``doc_type`` and, optionally, a ``sub_items`` dict of
    ``{class name: xpath}``. Every element matched by such an xpath is
    transformed recursively with that class name.

    :param raw_item_content_type: forwarded as-is to the item class and to
        recursive calls.
    :param raw_item: raw representation forwarded to the item class.
    :param item: object exposing ``xpath``.
    :param class_name: mapping key to use instead of the element's local
        name; falsy means "derive it from ``item``".
    :returns: list of ``(combined object id, object id, combined index doc,
        index doc, doc_type)`` tuples, parent first; ``[]`` when the class
        name is not mapped.
    """
    class_name = class_name or item.xpath("local-name()")

    mapping = self.source_definition['mapping']
    if class_name not in mapping:
        log.info('Skipping %s, does not exist in mapping' % class_name)
        return []

    item_source = mapping[class_name]
    class_path = item_source['item']

    children = list()
    if 'sub_items' in item_source:
        for child_class, child_xpath in item_source['sub_items'].items():
            for child in item.xpath(child_xpath):
                child_raw = etree.tostring(child)
                children += self.transform_item(
                    raw_item_content_type, child_raw, child,
                    class_name=child_class)

    doc_type = unicode(item_source['doc_type'])
    built = load_object(class_path)(
        self.source_definition, raw_item_content_type, raw_item, item,
        doc_type)
    self.add_resolveable_media_urls(built)

    parent = (built.get_combined_object_id(),
              built.get_object_id(),
              built.get_combined_index_doc(),
              built.get_index_doc(),
              built.doc_type)
    return [parent] + children
def get_model_class(properties):
    """Import the model class named by the RDF ``type`` entry in ``properties``.

    The first entry whose ``predicate`` equals the RDF ``type`` URI decides
    the result: its ``value`` is appended to ``ocd_backend.models.`` and
    resolved with ``load_object``.

    :param properties: iterable of dicts with ``predicate`` and ``value``
        keys.
    :returns: whatever ``load_object`` returns for the resolved name.
    :raises ImportError: when ``load_object`` raises ``ImportError``.
    :raises ValueError: when no entry carries the ``type`` predicate.
    """
    type_predicate = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'

    for candidate in properties:
        if candidate['predicate'] != type_predicate:
            continue
        try:
            return load_object('ocd_backend.models.%s' % candidate['value'])
        except ImportError:
            raise ImportError('Unable to import class "ocd_backend.models.%s"' % candidate['value'])

    raise ValueError('Unable to get model class: Object contains no "type" property.')
def _init_extractor_from_source(self, source_name): """ Initializes an extractor from a specified source name. """ try: source = [s for s in self.sources if s['id'] == source_name][0] except IndexError as e: source = None if source is None: return extractor_klass = load_object(source['extractor']) return extractor_klass(source)
def run(self, *args, **kwargs):
    """Start transformation of a single item.

    Called by the extractor. ``args`` is expected to hold the content-type
    and the original item (as a string); ``kwargs`` must contain the
    ``source_definition`` dict and may contain ``run_node``.

    The source definition, the item class named by its ``item`` key and the
    optional ``run_node`` (``None`` when absent) are stored on the instance
    before the item is deserialized and transformed.

    :returns: the output of :py:meth:`~BaseTransformer.transform_item`
    """
    source_definition = kwargs['source_definition']

    self.source_definition = source_definition
    self.item_class = load_object(source_definition['item'])
    self.run_node = kwargs.get('run_node')

    deserialized = self.deserialize_item(*args)  # pylint: disable=no-value-for-parameter
    return self.transform_item(*args, item=deserialized)  # pylint: disable=no-value-for-parameter
def run(self, *args, **kwargs):
    """Start transformation of a single item.

    Called by the extractor. The positional arguments are expected to be
    the content-type and the original item (as a string); the keyword
    arguments must contain the ``source_definition`` dict.

    :type raw_item_content_type: string
    :param raw_item_content_type: the content-type of the data retrieved
        from the source (e.g. ``application/json``)
    :type raw_item: string
    :param raw_item: the data in its original format, as retrieved from
        the source (as a string)
    :param source_definition: the configuration of a single source in the
        form of a dictionary (as defined in the settings)
    :type source_definition: dict
    :returns: the output of :py:meth:`~BaseTransformer.transform_item`
    """
    source_definition = kwargs['source_definition']

    self.source_definition = source_definition
    self.item_class = load_object(source_definition['item'])

    deserialized = self.deserialize_item(*args)
    return self.transform_item(*args, item=deserialized)
def run(self, *args, **kwargs):
    """Transform one extracted item.

    Invoked by the extractor with the content-type and the original item
    (as a string) as positional arguments, and the ``source_definition``
    dict as a keyword argument. The source definition and the item class
    named by its ``item`` key are stored on the instance first.

    :type raw_item_content_type: string
    :param raw_item_content_type: the content-type of the data retrieved
        from the source (e.g. ``application/json``)
    :type raw_item: string
    :param raw_item: the data in its original format, as retrieved from
        the source (as a string)
    :param source_definition: the configuration of a single source in the
        form of a dictionary (as defined in the settings)
    :type source_definition: dict
    :returns: the output of :py:meth:`~BaseTransformer.transform_item`
    """
    definition = kwargs['source_definition']
    self.source_definition = definition
    self.item_class = load_object(definition['item'])

    return self.transform_item(*args, item=self.deserialize_item(*args))
def _load_source(source):
    """Resolve every pipeline phase of ``source`` with ``load_object``.

    The ``extractor``, ``transformer``, ``item`` and ``loader`` entries are
    loaded in that order. The loaded objects are not returned; any
    exception raised while looking up or loading a phase propagates.

    :param source: mapping containing the four phase keys.
    """
    for phase in ('extractor', 'transformer', 'item', 'loader'):
        load_object(source[phase])
def cleanup(self, **kwargs):
    """Dispatch the cleanup task configured for this source.

    The ``cleanup`` entry of ``self.source_definition`` is resolved with
    ``load_object``, the result is called without arguments and ``delay``
    is invoked on it.

    :param kwargs: forwarded unchanged to the task's ``delay``.
    """
    task_cls = load_object(self.source_definition.get('cleanup'))
    task_cls().delay(**kwargs)
def cleanup(self, **kwargs):
    """Run the source's configured cleanup task asynchronously.

    ``load_object`` resolves the ``cleanup`` entry of
    ``self.source_definition``; the resolved object is called with no
    arguments and ``delay`` is called on the result.

    :param kwargs: passed through to ``delay``.
    """
    cleanup_path = self.source_definition.get("cleanup")
    task = load_object(cleanup_path)()
    task.delay(**kwargs)
def setup_pipeline(source_definition):
    """Queue a transform/enrich/load chain for every extracted item.

    The alias ``<settings.DEFAULT_INDEX_PREFIX>_<index_name or id>`` is
    created, pointing at a new timestamped index, when
    ``es.indices.exists`` reports it missing. The run status is stored
    through ``celery_app.backend`` under a fresh ``pipeline_<uuid>`` key:
    ``running`` at the start, ``done`` once every chain has been
    dispatched, and ``error`` when anything is raised while items are
    extracted or chains are dispatched.

    :param source_definition: configuration of a single source; must
        contain ``extractor``, ``transformer``, ``enrichers`` and
        ``loader``, may contain ``index_name`` and ``id``.
    :type source_definition: dict
    :raises ConfigurationError: when the alias lookup raises
        ``NotFoundError``.
    """
    timestamp_format = '%Y%m%d%H%M%S'

    # index_name is an alias of the current version of the index
    index_alias = '{prefix}_{index_name}'.format(
        prefix=settings.DEFAULT_INDEX_PREFIX,
        index_name=source_definition.get('index_name',
                                         source_definition.get('id'))
    )

    alias_exists = es.indices.exists(index_alias)
    if not alias_exists:
        created_at = datetime.utcnow().strftime(timestamp_format)
        index_name = '{index_alias}_{now}'.format(index_alias=index_alias,
                                                  now=created_at)
        es.indices.create(index_name)
        es.indices.put_alias(name=index_alias, index=index_name)

    # Find the current index name behind the alias specified in the config
    try:
        current_index_aliases = es.indices.get_alias(name=index_alias)
    except NotFoundError:
        raise ConfigurationError('Index with alias "{index_alias}" does '
                                 'not exist'.format(index_alias=index_alias))

    current_index_name = current_index_aliases.keys()[0]
    started_at = datetime.utcnow().strftime(timestamp_format)
    new_index_name = '{index_alias}_{now}'.format(index_alias=index_alias,
                                                  now=started_at)

    extractor = load_object(source_definition['extractor'])(source_definition)
    transformer = load_object(source_definition['transformer'])()

    # Pairs of (enricher task instance, enricher settings)
    enrichers = []
    for enricher_spec in source_definition['enrichers']:
        enricher_instance = load_object(enricher_spec[0])()
        enrichers.append((enricher_instance, enricher_spec[1]))

    loader = load_object(source_definition['loader'])()

    # Parameters that are passed to each task in the chain
    run_identifier = 'pipeline_{}'.format(uuid4().hex)
    params = {
        'run_identifier': run_identifier,
        'current_index_name': current_index_name,
        'new_index_name': new_index_name,
        'index_alias': index_alias
    }

    celery_app.backend.set(params['run_identifier'], 'running')
    run_identifier_chains = '{}_chains'.format(params['run_identifier'])

    try:
        for item in extractor.run():
            # Generate an identifier for each chain, and record that in
            # {}_chains, so that we can know for sure when all tasks
            # from an extractor have finished
            chain_id = uuid4().hex
            params['chain_id'] = chain_id
            celery_app.backend.add_value_to_set(
                set_name=run_identifier_chains, value=chain_id)

            item_chain = chain()

            # Transform
            item_chain |= transformer.s(
                *item, source_definition=source_definition, **params)

            # Enrich
            for enricher_task, enricher_settings in enrichers:
                item_chain |= enricher_task.s(
                    source_definition=source_definition,
                    enricher_settings=enricher_settings,
                    **params)

            # Load
            item_chain |= loader.s(
                source_definition=source_definition, **params)

            item_chain.delay()
    except:
        # Deliberately broad: the status is recorded for every kind of
        # failure and the exception is re-raised unchanged.
        logger.error('An exception has occured in the "{extractor}" extractor. '
                     'Deleting index "{index}" and setting status of run '
                     'identifier "{run_identifier}" to "error".'
                     .format(index=new_index_name,
                             run_identifier=params['run_identifier'],
                             extractor=source_definition['extractor']))
        celery_app.backend.set(params['run_identifier'], 'error')
        raise

    celery_app.backend.set(params['run_identifier'], 'done')
def setup_pipeline(source_definition):
    """Prepare the index for a source and queue task chains per pipeline.

    Steps, in order:

    1. Build the alias ``<settings.DEFAULT_INDEX_PREFIX>_<index_name or id>``
       and, when ``es.indices.exists`` reports it missing, create a
       timestamped index and point the alias at it.
    2. Determine the current index behind the alias and the index new data
       goes to (the current one when ``keep_index_on_update`` is truthy,
       otherwise a new timestamped name).
    3. Store the status ``running`` through ``celery_app.backend`` under a
       fresh ``pipeline_<uuid>`` run identifier.
    4. For every pipeline (``source_definition['pipelines']``, or the
       source definition itself when none are given) load its extractor,
       extensions, transformer, enrichers and loaders.
    5. For every item yielded by a pipeline's extractor, queue a chain of
       extensions, transformer, enrichers and a group of loaders.

    :param source_definition: configuration of a single source.
    :type source_definition: dict
    :raises ConfigurationError: when the alias lookup raises
        ``NotFoundError`` or a pipeline has no ``id``.
    :raises SystemExit: when a ``KeyboardInterrupt`` arrives while chains
        are being queued.
    """
    logger.info('Starting pipeline for source: %s',
                source_definition.get('id'))

    # index_name is an alias of the current version of the index
    index_alias = '{prefix}_{index_name}'.format(
        prefix=settings.DEFAULT_INDEX_PREFIX,
        index_name=source_definition.get('index_name',
                                         source_definition.get('id'))
    )

    if not es.indices.exists(index_alias):
        index_name = '{index_alias}_{now}'.format(
            index_alias=index_alias,
            now=datetime.utcnow().strftime('%Y%m%d%H%M%S')
        )
        es.indices.create(index_name)
        es.indices.put_alias(name=index_alias, index=index_name)

    # Find the current index name behind the alias specified in the config
    try:
        current_index_aliases = es.indices.get_alias(name=index_alias)
    except NotFoundError:
        raise ConfigurationError('Index with alias "{index_alias}" does '
                                 'not exist'.format(index_alias=index_alias))

    # list(...) keeps this working where .keys() returns a non-indexable
    # view (Python 3); on Python 2 the result is unchanged.
    current_index_name = list(current_index_aliases.keys())[0]

    # Check if the source specifies that any update should be added to
    # the current index instead of a new one
    if source_definition.get('keep_index_on_update'):
        new_index_name = current_index_name
    else:
        new_index_name = '{index_alias}_{now}'.format(
            index_alias=index_alias,
            now=datetime.utcnow().strftime('%Y%m%d%H%M%S')
        )

    # Parameters that are passed to each task in the chain
    params = {
        'run_identifier': 'pipeline_{}'.format(uuid4().hex),
        'current_index_name': current_index_name,
        'new_index_name': new_index_name,
        'index_alias': index_alias
    }

    logger.debug('Starting run with identifier %s', params['run_identifier'])

    celery_app.backend.set(params['run_identifier'], 'running')
    run_identifier_chains = '{}_chains'.format(params['run_identifier'])

    # We can have multiple pipelines, but for compatibility and readability
    # use the source definition if no specific pipelines have been defined
    pipelines = source_definition.get('pipelines', None) or [source_definition]

    pipeline_definitions = {}
    pipeline_extractors = {}
    pipeline_extensions = {}
    pipeline_transformers = {}
    pipeline_enrichers = {}
    pipeline_loaders = {}

    for pipeline in pipelines:
        if 'id' not in pipeline:
            raise ConfigurationError("Each pipeline must have an id field.")
        pipeline_id = pipeline['id']

        # Adjusted source definitions per pipeline. This way you can for
        # example change the index on a pipeline basis
        definition = deepcopy(source_definition)
        definition.update(pipeline)
        pipeline_definitions[pipeline_id] = definition

        # Initialize the ETL classes, per pipeline
        pipeline_extractors[pipeline_id] = load_object(definition['extractor'])

        pipeline_extensions[pipeline_id] = [
            load_object(cls) for cls in definition.get('extensions', [])]

        # Only a transformer set on the pipeline entry itself is used here;
        # one inherited from the source definition is not.
        if pipeline.get('transformer'):
            pipeline_transformers[pipeline_id] = load_object(
                pipeline['transformer'])()

        pipeline_enrichers[pipeline_id] = [
            (load_object(enricher[0])(), enricher[1] or {})
            for enricher in definition.get('enrichers', [])]

        # 'loaders' (a list) takes precedence over the single 'loader'
        pipeline_loaders[pipeline_id] = list()
        for cls in definition.get('loaders', None) or \
                [definition.get('loader', None)]:
            if cls:
                pipeline_loaders[pipeline_id].append(load_object(cls)())

    # NOTE(review): `result` is not read in the visible part of this
    # function; it holds the handle of the last chain that was queued.
    result = None
    for pipeline in pipelines:
        pipeline_id = pipeline['id']
        definition = pipeline_definitions[pipeline_id]
        try:
            # The first extractor should be a generator instead of a task
            for item in pipeline_extractors[pipeline_id](
                    source_definition=definition).run():
                step_chain = list()

                params['chain_id'] = uuid4().hex
                celery_app.backend.add_value_to_set(
                    set_name=run_identifier_chains,
                    value=params['chain_id'])

                # Remaining extractors
                for extension in pipeline_extensions[pipeline_id]:
                    step_chain.append(extension().s(
                        *item,
                        source_definition=definition,
                        **params
                    ))
                    # Prevent old item being passed down to next steps
                    item = []

                # Transformers
                if pipeline_transformers.get(pipeline_id):
                    step_chain.append(pipeline_transformers[pipeline_id].s(
                        *item,
                        source_definition=definition,
                        **params
                    ))

                # Enrichers
                for enricher_task, enricher_settings in \
                        pipeline_enrichers[pipeline_id]:
                    step_chain.append(enricher_task.s(
                        source_definition=definition,
                        enricher_settings=enricher_settings,
                        **params
                    ))

                # Loaders
                # Multiple loaders to enable to save to different stores
                initialized_loaders = [
                    loader.s(source_definition=definition, **params)
                    for loader in pipeline_loaders[pipeline_id]]
                step_chain.append(group(initialized_loaders))

                result = chain(step_chain).delay()
        except KeyboardInterrupt:
            logger.warning('KeyboardInterrupt received. Stopping the program.')
            # The `exit()` helper from the `site` module is intended for
            # the interactive shell; raise SystemExit directly instead.
            raise SystemExit()
        except Exception as e:
            logger.error('An exception has occured in the "{extractor}" extractor.'
                         ' Setting status of run identifier "{run_identifier}" to '
                         '"error":\n{message}'
                         .format(index=params['new_index_name'],
                                 run_identifier=params['run_identifier'],
                                 extractor=pipeline_extractors[pipeline_id],
                                 message=e,
                                 )
                         )
            celery_app.backend.set(params['run_identifier'], 'error')
            raise
def setup_pipeline(source_definition):
    """Prepare the index for a source and queue task chains per pipeline.

    Steps, in order:

    1. Build the alias ``<es_prefix or settings.DEFAULT_INDEX_PREFIX>_
       <index_name or id>`` and, when ``es.indices.exists`` reports it
       missing, create a timestamped index and point the alias at it.
    2. Determine the current index behind the alias and the index new data
       goes to (the current one when ``keep_index_on_update`` is truthy,
       otherwise a new timestamped name).
    3. Store the status ``running`` through ``celery_app.backend`` under a
       fresh ``pipeline_<uuid>`` run identifier.
    4. For every pipeline (``source_definition['pipelines']``, or the
       source definition itself when none are given) load its extractor,
       transformer, enrichers and loaders.
    5. For every item yielded by a pipeline's extractor, queue a chain of
       transformer, enrichers and a group of loaders.

    :param source_definition: configuration of a single source; must
        contain ``key``, which is used in log messages.
    :type source_definition: dict
    :raises ConfigurationError: when the alias lookup raises
        ``NotFoundError`` or a pipeline has no ``id``.
    :raises SystemExit: when a ``KeyboardInterrupt`` arrives while chains
        are being queued.
    """
    logger.debug('[%s] Starting pipeline for source: %s',
                 source_definition['key'], source_definition.get('id'))

    # index_name is an alias of the current version of the index
    index_alias = '{prefix}_{index_name}'.format(
        prefix=source_definition.get('es_prefix',
                                     settings.DEFAULT_INDEX_PREFIX),
        index_name=source_definition.get('index_name',
                                         source_definition.get('id'))
    )

    if not es.indices.exists(index_alias):
        index_name = '{index_alias}_{now}'.format(
            index_alias=index_alias,
            now=datetime.utcnow().strftime('%Y%m%d%H%M%S')
        )
        es.indices.create(index_name)
        es.indices.put_alias(name=index_alias, index=index_name)

    # Find the current index name behind the alias specified in the config
    try:
        current_index_aliases = es.indices.get_alias(name=index_alias)
    except NotFoundError:
        raise ConfigurationError('Index with alias "{index_alias}" does '
                                 'not exist'.format(index_alias=index_alias))

    # list(...) keeps this working where .keys() returns a non-indexable
    # view (Python 3); on Python 2 the result is unchanged.
    current_index_name = list(current_index_aliases.keys())[0]

    # Check if the source specifies that any update should be added to
    # the current index instead of a new one
    if source_definition.get('keep_index_on_update'):
        new_index_name = current_index_name
    else:
        new_index_name = '{index_alias}_{now}'.format(
            index_alias=index_alias,
            now=datetime.utcnow().strftime('%Y%m%d%H%M%S')
        )

    # Parameters that are passed to each task in the chain
    params = {
        'run_identifier': 'pipeline_{}'.format(uuid4().hex),
        'current_index_name': current_index_name,
        'new_index_name': new_index_name,
        'index_alias': index_alias,
    }

    logger.debug('[%s] Starting run with identifier %s',
                 source_definition['key'], params['run_identifier'])

    celery_app.backend.set(params['run_identifier'], 'running')
    run_identifier_chains = '{}_chains'.format(params['run_identifier'])

    # We can have multiple pipelines, but for compatibility and readability
    # use the source definition if no specific pipelines have been defined
    pipelines = source_definition.get('pipelines', None) or [source_definition]

    pipeline_definitions = {}
    pipeline_extractors = {}
    pipeline_transformers = {}
    pipeline_enrichers = {}
    pipeline_loaders = {}

    for pipeline in pipelines:
        if 'id' not in pipeline:
            raise ConfigurationError("Each pipeline must have an id field.")
        pipeline_id = pipeline['id']

        # Adjusted source definitions per pipeline. This way you can for
        # example change the index on a pipeline basis
        definition = deepcopy(source_definition)
        definition.update(pipeline)
        pipeline_definitions[pipeline_id] = definition

        # Initialize the ETL classes, per pipeline. The loaded objects are
        # used as-is (not called) before `.s(...)` is invoked on them.
        pipeline_extractors[pipeline_id] = load_object(definition['extractor'])

        pipeline_transformers[pipeline_id] = load_object(
            definition['transformer'])

        # NOTE(review): the `or {}` fallback yields a dict, which has no
        # `.s`; kept unchanged -- confirm whether it can be dropped.
        pipeline_enrichers[pipeline_id] = [
            (load_object(enricher) or {})
            for enricher in definition.get('enrichers', [])]

        # 'loaders' (a list) takes precedence over the single 'loader'
        pipeline_loaders[pipeline_id] = list()
        for cls in definition.get('loaders', None) or \
                [definition.get('loader', None)]:
            if cls:
                pipeline_loaders[pipeline_id].append(load_object(cls))

    # NOTE(review): `result` is not read in the visible part of this
    # function; it holds the handle of the last chain that was queued.
    result = None
    for pipeline in pipelines:
        pipeline_id = pipeline['id']
        definition = pipeline_definitions[pipeline_id]
        try:
            # The first extractor should be a generator instead of a task
            for item in pipeline_extractors[pipeline_id](
                    source_definition=definition).run():
                step_chain = list()

                params['chain_id'] = uuid4().hex
                params['start_time'] = datetime.now()

                celery_app.backend.add_value_to_set(
                    set_name=run_identifier_chains,
                    value=params['chain_id'])

                # Transformers
                if pipeline_transformers.get(pipeline_id):
                    step_chain.append(pipeline_transformers[pipeline_id].s(
                        *item,
                        source_definition=definition,
                        **params
                    ))

                # Enrichers
                for enricher_task in pipeline_enrichers[pipeline_id]:
                    step_chain.append(enricher_task.s(
                        source_definition=definition,
                        **params
                    ))

                # Loaders
                # Multiple loaders to enable to save to different stores
                initialized_loaders = [
                    loader.s(source_definition=definition, **params)
                    for loader in pipeline_loaders[pipeline_id]]
                step_chain.append(group(initialized_loaders))

                result = chain(step_chain).delay()
        except KeyboardInterrupt:
            logger.warning('KeyboardInterrupt received. Stopping the program.')
            # The `exit()` helper from the `site` module is intended for
            # the interactive shell; raise SystemExit directly instead.
            raise SystemExit()
        except Exception as e:
            logger.error('[{site_name}] Pipeline has failed. Setting status of '
                         'run identifier "{run_identifier}" to "error":\n{message}'
                         .format(index=params['new_index_name'],
                                 run_identifier=params['run_identifier'],
                                 extractor=pipeline_extractors[pipeline_id],
                                 message=e,
                                 site_name=source_definition['key'],
                                 )
                         )
            celery_app.backend.set(params['run_identifier'], 'error')

            # Reraise the exception so celery can autoretry
            raise
def setup_pipeline(source_definition):
    """Queue a transform/enrich/load chain for every extracted item.

    The alias ``<settings.DEFAULT_INDEX_PREFIX>_<index_name or id>`` is
    created, pointing at a new timestamped index, when
    ``es.indices.exists`` reports it missing. New data goes to the current
    index when ``keep_index_on_update`` is truthy, otherwise to a new
    timestamped index name. The run status is stored through
    ``celery_app.backend`` under a fresh ``pipeline_<uuid>`` key:
    ``running`` at the start, ``done`` once every chain has been
    dispatched, and ``error`` when anything is raised while items are
    extracted or chains are dispatched.

    :param source_definition: configuration of a single source; must
        contain ``extractor``, ``transformer``, ``enrichers`` and
        ``loader``, may contain ``index_name``, ``id`` and
        ``keep_index_on_update``.
    :type source_definition: dict
    :raises ConfigurationError: when the alias lookup raises
        ``NotFoundError``.
    """
    # index_name is an alias of the current version of the index
    index_alias = '{prefix}_{index_name}'.format(
        prefix=settings.DEFAULT_INDEX_PREFIX,
        index_name=source_definition.get('index_name',
                                         source_definition.get('id'))
    )

    if not es.indices.exists(index_alias):
        index_name = '{index_alias}_{now}'.format(
            index_alias=index_alias,
            now=datetime.utcnow().strftime('%Y%m%d%H%M%S')
        )
        es.indices.create(index_name)
        es.indices.put_alias(name=index_alias, index=index_name)

    # Find the current index name behind the alias specified in the config
    try:
        current_index_aliases = es.indices.get_alias(name=index_alias)
    except NotFoundError:
        raise ConfigurationError('Index with alias "{index_alias}" does '
                                 'not exist'.format(index_alias=index_alias))

    # list(...) keeps this working where .keys() returns a non-indexable
    # view (Python 3); on Python 2 the result is unchanged.
    current_index_name = list(current_index_aliases.keys())[0]

    # Check if the source specifies that any update should be added to
    # the current index instead of a new one. A missing key is treated as
    # "create a new index" rather than raising KeyError.
    if source_definition.get('keep_index_on_update'):
        new_index_name = current_index_name
    else:
        new_index_name = '{index_alias}_{now}'.format(
            index_alias=index_alias,
            now=datetime.utcnow().strftime('%Y%m%d%H%M%S')
        )

    extractor = load_object(source_definition['extractor'])(source_definition)
    transformer = load_object(source_definition['transformer'])()
    # Pairs of (enricher task instance, enricher settings)
    enrichers = [(load_object(enricher[0])(), enricher[1])
                 for enricher in source_definition['enrichers']]
    loader = load_object(source_definition['loader'])()

    # Parameters that are passed to each task in the chain
    params = {
        'run_identifier': 'pipeline_{}'.format(uuid4().hex),
        'current_index_name': current_index_name,
        'new_index_name': new_index_name,
        'index_alias': index_alias
    }

    celery_app.backend.set(params['run_identifier'], 'running')
    run_identifier_chains = '{}_chains'.format(params['run_identifier'])

    try:
        for item in extractor.run():
            # Generate an identifier for each chain, and record that in
            # {}_chains, so that we can know for sure when all tasks
            # from an extractor have finished
            params['chain_id'] = uuid4().hex
            celery_app.backend.add_value_to_set(
                set_name=run_identifier_chains, value=params['chain_id'])

            item_chain = chain()

            # Transform
            item_chain |= transformer.s(
                *item,
                source_definition=source_definition,
                **params
            )

            # Enrich
            for enricher_task, enricher_settings in enrichers:
                item_chain |= enricher_task.s(
                    source_definition=source_definition,
                    enricher_settings=enricher_settings,
                    **params
                )

            # Load
            item_chain |= loader.s(
                source_definition=source_definition,
                **params
            )

            item_chain.delay()
    except:
        # Deliberately broad: the status is recorded for every kind of
        # failure and the exception is re-raised unchanged.
        logger.error('An exception has occured in the "{extractor}" extractor. '
                     'Setting status of run identifier "{run_identifier}" to '
                     '"error".'
                     .format(index=new_index_name,
                             run_identifier=params['run_identifier'],
                             extractor=source_definition['extractor']))
        celery_app.backend.set(params['run_identifier'], 'error')
        raise

    celery_app.backend.set(params['run_identifier'], 'done')
def setup_pipeline(source_definition):
    """Prepare the source's own and combined indexes, then queue item chains.

    The source's own index is set up through ``initialize_index``,
    ``get_current_index`` and ``get_new_index``. The same three helpers
    are then run for every name in ``source_definition['index_names']``
    (default ``['combined_index']``), using the alias
    ``<settings.DEFAULT_INDEX_PREFIX>_<name>``. After that a
    transform/enrich/load chain is dispatched for every item the extractor
    yields.

    The run status is stored through ``celery_app.backend`` under a fresh
    ``pipeline_<uuid>`` key: ``running`` at the start, ``done`` once every
    chain has been dispatched, and ``error`` when anything is raised while
    items are extracted or chains are dispatched.

    :param source_definition: configuration of a single source; must
        contain ``extractor``, ``transformer``, ``enrichers`` and
        ``loader``.
    :type source_definition: dict
    """
    current_date_and_time = datetime.utcnow()

    # first the secondary index
    index_alias = initialize_index(source_definition, current_date_and_time)
    current_index_name = get_current_index(index_alias)
    new_index_name = get_new_index(
        source_definition, current_index_name, index_alias,
        current_date_and_time)

    # now the combined index
    index_names = source_definition.get('index_names', ['combined_index'])
    current_index_names = []
    new_index_names = []
    additional_aliases = []
    for cur_index in index_names:
        # NOTE(review): the 'id' is hard-coded to 'test_source' -- confirm
        # this is intended outside of tests.
        additional_source_definition = {
            'id': 'test_source',
            'index_name': cur_index,
            # .get: a missing key is passed on as None instead of raising
            # KeyError here.
            'keep_index_on_update': source_definition.get(
                'keep_index_on_update')
        }
        additional_alias = '%s_%s' % (
            settings.DEFAULT_INDEX_PREFIX, cur_index,)
        additional_aliases.append(additional_alias)

        initialize_index(additional_source_definition, current_date_and_time)
        current_additional_index_name = get_current_index(additional_alias)
        new_additional_index_name = get_new_index(
            additional_source_definition, current_additional_index_name,
            additional_alias, current_date_and_time)

        current_index_names.append(current_additional_index_name)
        new_index_names.append(new_additional_index_name)

        # Parenthesized single-argument form: prints the same text on
        # Python 2 and is valid syntax on Python 3.
        print("Setting up alias for %s (-> %s)" % (
            cur_index, new_additional_index_name,))

    # now load objects and prepare the run ...
    extractor = load_object(source_definition['extractor'])(source_definition)
    transformer = load_object(source_definition['transformer'])()
    # Pairs of (enricher task instance, enricher settings)
    enrichers = [(load_object(enricher[0])(), enricher[1])
                 for enricher in source_definition['enrichers']]
    loader = load_object(source_definition['loader'])()

    # Parameters that are passed to each task in the chain
    params = {
        'run_identifier': 'pipeline_{}'.format(uuid4().hex),
        'current_index_name': current_index_name,
        'new_index_name': new_index_name,
        'index_alias': index_alias,
        'index_aliases': additional_aliases,
        'current_index_names': current_index_names,
        'new_index_names': new_index_names
    }

    celery_app.backend.set(params['run_identifier'], 'running')
    run_identifier_chains = '{}_chains'.format(params['run_identifier'])

    try:
        for item in extractor.run():
            # Generate an identifier for each chain, and record that in
            # {}_chains, so that we can know for sure when all tasks
            # from an extractor have finished
            params['chain_id'] = uuid4().hex
            celery_app.backend.add_value_to_set(
                set_name=run_identifier_chains, value=params['chain_id'])

            item_chain = chain()

            # Transform
            item_chain |= transformer.s(
                *item, source_definition=source_definition, **params)

            # Enrich
            for enricher_task, enricher_settings in enrichers:
                item_chain |= enricher_task.s(
                    source_definition=source_definition,
                    enricher_settings=enricher_settings,
                    **params)

            # Load
            item_chain |= loader.s(
                source_definition=source_definition, **params)

            item_chain.delay()
    except:
        # Deliberately broad: the status is recorded for every kind of
        # failure and the exception is re-raised unchanged.
        logger.error(
            'An exception has occured in the "{extractor}" extractor. '
            'Setting status of run identifier "{run_identifier}" to '
            '"error".'.format(index=new_index_name,
                              run_identifier=params['run_identifier'],
                              extractor=source_definition['extractor']))
        celery_app.backend.set(params['run_identifier'], 'error')
        raise

    celery_app.backend.set(params['run_identifier'], 'done')