Example #1
def setup_pipeline(source_definition):
    extractor = load_object(source_definition['extractor'])(source_definition)
    transformer = load_object(source_definition['transformer'])()
    loader = load_object(source_definition['loader'])()

    for item in extractor.run():
        (transformer.s(*item, source_definition=source_definition)
         | loader.s(source_definition=source_definition)).delay()
Example #2
def setup_pipeline(source_definition):
    extractor = load_object(source_definition['extractor'])(source_definition)
    transformer = load_object(source_definition['transformer'])()
    loader = load_object(source_definition['loader'])()

    for item in extractor.run():
        (transformer.s(*item, source_definition=source_definition)
         | loader.s(source_definition=source_definition)).delay()
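Every example in this listing resolves dotted-path strings with a load_object helper. The project's own implementation is not shown here; below is a minimal sketch of the importlib-based pattern such helpers typically follow (the error messages and exact behaviour are assumptions, not the project's code):

from importlib import import_module

def load_object(path):
    """Import and return the object at a dotted path, e.g. 'pkg.module.Class'."""
    try:
        module_path, name = path.rsplit('.', 1)
    except ValueError:
        raise ValueError('"%s" is not a valid dotted path' % path)
    module = import_module(module_path)
    try:
        return getattr(module, name)
    except AttributeError:
        raise ImportError('Module "%s" has no attribute "%s"'
                          % (module_path, name))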
Example #3
    def transform_item(self, raw_item_content_type, raw_item, item,
                       class_name=False):

        if not class_name:
            class_name = item.xpath("local-name()")

        if class_name in self.source_definition['mapping']:
            item_source = self.source_definition['mapping'][class_name]
            item_class = item_source['item']
        else:
            log.info('Skipping %s, does not exist in mapping' % class_name)
            return []

        items = list()
        if 'sub_items' in item_source:
            for key, path in item_source['sub_items'].items():
                for sub_item in item.xpath(path):
                    items += self.transform_item(raw_item_content_type,
                                                 etree.tostring(sub_item),
                                                 sub_item, class_name=key)

        item_class = load_object(item_class)
        item = item_class(self.source_definition, raw_item_content_type,
                          raw_item, item, unicode(item_source['doc_type']))

        self.add_resolveable_media_urls(item)

        return [(
            item.get_combined_object_id(),
            item.get_object_id(),
            item.get_combined_index_doc(),
            item.get_index_doc(),
            item.doc_type
        )] + items
Example #4
    def transform_item(self,
                       raw_item_content_type,
                       raw_item,
                       item,
                       class_name=False):

        if not class_name:
            class_name = item.xpath("local-name()")

        if class_name in self.source_definition['mapping']:
            item_source = self.source_definition['mapping'][class_name]
            item_class = item_source['item']
        else:
            log.info('Skipping %s, does not exist in mapping' % class_name)
            return []

        items = list()
        if 'sub_items' in item_source:
            for key, path in item_source['sub_items'].items():
                for sub_item in item.xpath(path):
                    items += self.transform_item(raw_item_content_type,
                                                 etree.tostring(sub_item),
                                                 sub_item,
                                                 class_name=key)

        item_class = load_object(item_class)
        item = item_class(self.source_definition, raw_item_content_type,
                          raw_item, item, unicode(item_source['doc_type']))

        self.add_resolveable_media_urls(item)

        return [(item.get_combined_object_id(), item.get_object_id(),
                 item.get_combined_index_doc(), item.get_index_doc(),
                 item.doc_type)] + items
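For context, transform_item is driven by the 'mapping' section of the source definition: each local-name maps to an item class and, optionally, to sub_items with XPath selectors that trigger the recursion above. A hypothetical mapping showing the fields being read (all names, paths, and dotted paths are illustrative):

source_definition = {
    'mapping': {
        'meeting': {
            'item': 'ocd_backend.items.Meeting',  # resolved via load_object
            'doc_type': 'events',
            'sub_items': {
                # local-name for the sub item -> XPath that selects it
                'agenda_item': './/agendaitem',
            },
        },
        'agenda_item': {
            'item': 'ocd_backend.items.AgendaItem',
            'doc_type': 'events',
        },
    },
}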
Example #5
def get_model_class(properties):
    """
    Finds the "type" property in the list of properties and imports the
    model class of that name.
    """
    for _property in properties:
        if _property['predicate'] == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type':
            try:
                return load_object('ocd_backend.models.%s' % _property['value'])
            except ImportError:
                raise ImportError('Unable to import class "ocd_backend.models.%s"'
                                  % _property['value'])
    raise ValueError('Unable to get model class: Object contains no "type" property.')
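A hypothetical call showing the property shape get_model_class expects (the 'Meeting' value is illustrative):

properties = [
    {'predicate': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
     'value': 'Meeting'},
]
model_class = get_model_class(properties)  # -> ocd_backend.models.Meeting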
Example #6
    def _init_extractor_from_source(self, source_name):
        """
        Initializes an extractor from a specified source name.
        """
        try:
            source = [s for s in self.sources if s['id'] == source_name][0]
        except IndexError:
            source = None

        if source is None:
            return

        extractor_klass = load_object(source['extractor'])
        return extractor_klass(source)
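The list-comprehension-plus-[0] lookup could equally be written with next(), which stops at the first match; a sketch with a hypothetical sources list showing the only two fields the method relies on:

sources = [
    {'id': 'municipality',
     'extractor': 'ocd_backend.extractors.staticfile.StaticJSONExtractor'},
]
source = next((s for s in sources if s['id'] == 'municipality'), None)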
Example #7
    def run(self, *args, **kwargs):
        """Start transformation of a single item.

        This method is called by the extractor and expects args to
        contain the content-type and the original item (as a string).
        Kwargs should contain the ``source_definition`` dict.

        :returns: the output of :py:meth:`~BaseTransformer.transform_item`
        """
        self.source_definition = kwargs['source_definition']
        self.item_class = load_object(kwargs['source_definition']['item'])
        self.run_node = kwargs.get('run_node')

        item = self.deserialize_item(*args)  # pylint: disable=no-value-for-parameter
        return self.transform_item(*args, item=item)  # pylint: disable=no-value-for-parameter
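A hypothetical direct invocation matching the contract in the docstring: args carry the content-type and the raw item string, kwargs carry source_definition (the transformer class and item path are made up):

transformer = JSONTransformer()  # some BaseTransformer subclass
result = transformer.run(
    'application/json',
    '{"title": "Example item"}',
    source_definition={'item': 'ocd_backend.items.ExampleItem'},
)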
Example #8
    def run(self, *args, **kwargs):
        """Start transformation of a single item.

        This method is called by the extractor and expects args to
        contain the content-type and the original item (as a string).
        Kwargs should contain the ``source_definition`` dict.

        :returns: the output of :py:meth:`~BaseTransformer.transform_item`
        """
        self.source_definition = kwargs['source_definition']
        self.item_class = load_object(kwargs['source_definition']['item'])
        self.run_node = kwargs.get('run_node')

        item = self.deserialize_item(*args)  # pylint: disable=no-value-for-parameter
        return self.transform_item(*args, item=item)  # pylint: disable=no-value-for-parameter
Example #9
    def run(self, *args, **kwargs):
        """Start transformation of a single item.

        This method is called by the extractor and expects args to
        contain the content-type and the original item (as a string).
        Kwargs should contain the ``source_definition`` dict.

        :type raw_item_content_type: string
        :param raw_item_content_type: the content-type of the data
            retrieved from the source (e.g. ``application/json``)
        :type raw_item: string
        :param raw_item: the data in its original format, as retrieved
            from the source (as a string)
        :param source_definition: The configuration of a single source in
            the form of a dictionary (as defined in the settings).
        :type source_definition: dict.
        :returns: the output of :py:meth:`~BaseTransformer.transform_item`
        """
        self.source_definition = kwargs['source_definition']
        self.item_class = load_object(kwargs['source_definition']['item'])

        item = self.deserialize_item(*args)
        return self.transform_item(*args, item=item)
Example #10
    def run(self, *args, **kwargs):
        """Start transformation of a single item.

        This method is called by the extractor and expects args to
        contain the content-type and the original item (as a string).
        Kwargs should contain the ``source_definition`` dict.

        :type raw_item_content_type: string
        :param raw_item_content_type: the content-type of the data
            retrieved from the source (e.g. ``application/json``)
        :type raw_item: string
        :param raw_item: the data in its original format, as retrieved
            from the source (as a string)
        :param source_definition: The configuration of a single source in
            the form of a dictionary (as defined in the settings).
        :type source_definition: dict.
        :returns: the output of :py:meth:`~BaseTransformer.transform_item`
        """
        self.source_definition = kwargs['source_definition']
        self.item_class = load_object(kwargs['source_definition']['item'])

        item = self.deserialize_item(*args)
        return self.transform_item(*args, item=item)
Example #11
def _load_source(source):
    for phase in ['extractor', 'transformer', 'item', 'loader']:
        module = load_object(source[phase])
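A hypothetical source dict for this validation loop; each phase value is a dotted path that load_object tries to import, so misconfigured sources fail fast (all paths are illustrative):

source = {
    'extractor': 'ocd_backend.extractors.staticfile.StaticJSONExtractor',
    'transformer': 'ocd_backend.transformers.BaseTransformer',
    'item': 'ocd_backend.items.ExampleItem',
    'loader': 'ocd_backend.loaders.ElasticsearchLoader',
}
_load_source(source)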
Example #12
    def cleanup(self, **kwargs):
        cleanup_task = load_object(self.source_definition.get('cleanup'))()
        cleanup_task.delay(**kwargs)
Example #13
    def cleanup(self, **kwargs):
        cleanup_task = load_object(self.source_definition.get("cleanup"))()
        cleanup_task.delay(**kwargs)
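Both cleanup hooks read a dotted path from the source definition and schedule the resulting task asynchronously; a hypothetical entry (the task path is illustrative):

source_definition = {
    'cleanup': 'ocd_backend.tasks.CleanupElasticsearch',
}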
Example #14
def setup_pipeline(source_definition):
    # index_name is an alias of the current version of the index
    index_alias = '{prefix}_{index_name}'.format(
        prefix=settings.DEFAULT_INDEX_PREFIX,
        index_name=source_definition.get('index_name',
                                         source_definition.get('id'))
    )

    if not es.indices.exists(index_alias):
        index_name = '{index_alias}_{now}'.format(index_alias=index_alias,
                                                  now=datetime.utcnow()
                                                  .strftime('%Y%m%d%H%M%S'))

        es.indices.create(index_name)
        es.indices.put_alias(name=index_alias, index=index_name)

    # Find the current index name behind the alias specified in the config
    try:
        current_index_aliases = es.indices.get_alias(name=index_alias)
    except NotFoundError:
        raise ConfigurationError('Index with alias "{index_alias}" does '
                                 'not exist'.format(index_alias=index_alias))

    current_index_name = current_index_aliases.keys()[0]
    new_index_name = '{index_alias}_{now}'.format(
        index_alias=index_alias, now=datetime.utcnow().strftime('%Y%m%d%H%M%S')
    )

    extractor = load_object(source_definition['extractor'])(source_definition)
    transformer = load_object(source_definition['transformer'])()
    enrichers = [(load_object(enricher[0])(), enricher[1]) for enricher in
                 source_definition['enrichers']]
    loader = load_object(source_definition['loader'])()

    # Parameters that are passed to each task in the chain
    params = {
        'run_identifier': 'pipeline_{}'.format(uuid4().hex),
        'current_index_name': current_index_name,
        'new_index_name': new_index_name,
        'index_alias': index_alias
    }

    celery_app.backend.set(params['run_identifier'], 'running')
    run_identifier_chains = '{}_chains'.format(params['run_identifier'])

    try:
        for item in extractor.run():
            # Generate an identifier for each chain, and record that in
            # {}_chains, so that we can know for sure when all tasks
            # from an extractor have finished
            params['chain_id'] = uuid4().hex
            celery_app.backend.add_value_to_set(set_name=run_identifier_chains,
                                                value=params['chain_id'])

            item_chain = chain()

            # Transform
            item_chain |= transformer.s(
                *item,
                source_definition=source_definition,
                **params
            )

            # Enrich
            for enricher_task, enricher_settings in enrichers:
                item_chain |= enricher_task.s(
                    source_definition=source_definition,
                    enricher_settings=enricher_settings,
                    **params
                )

            # Load
            item_chain |= loader.s(
                source_definition=source_definition,
                **params
            )

            item_chain.delay()
    except:
        logger.error('An exception has occurred in the "{extractor}" extractor. '
                     'Deleting index "{index}" and setting status of run '
                     'identifier "{run_identifier}" to "error".'
                     .format(index=new_index_name,
                             run_identifier=params['run_identifier'],
                             extractor=source_definition['extractor']))

        celery_app.backend.set(params['run_identifier'], 'error')
        raise

    celery_app.backend.set(params['run_identifier'], 'done')
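The item_chain |= ... lines compose Celery task signatures into a chain. A standalone sketch of the same pattern, assuming an illustrative in-memory broker and made-up task bodies:

from celery import Celery, chain

app = Celery('sketch', broker='memory://')

@app.task
def transform(raw, source_definition=None):
    return raw.upper()

@app.task
def load(doc, source_definition=None):
    return doc

item_chain = chain()
item_chain |= transform.s('raw item', source_definition={})
item_chain |= load.s(source_definition={})
# .delay() enqueues the chain; .apply() would run it eagerly for testing.
item_chain.delay()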
Example #15
def setup_pipeline(source_definition):
    logger.info('Starting pipeline for source: %s' % source_definition.get('id'))

    # index_name is an alias of the current version of the index
    index_alias = '{prefix}_{index_name}'.format(
        prefix=settings.DEFAULT_INDEX_PREFIX,
        index_name=source_definition.get('index_name',
                                         source_definition.get('id'))
    )

    if not es.indices.exists(index_alias):
        index_name = '{index_alias}_{now}'.format(index_alias=index_alias,
                                                  now=datetime.utcnow()
                                                  .strftime('%Y%m%d%H%M%S'))

        es.indices.create(index_name)
        es.indices.put_alias(name=index_alias, index=index_name)

    # Find the current index name behind the alias specified in the config
    try:
        current_index_aliases = es.indices.get_alias(name=index_alias)
    except NotFoundError:
        raise ConfigurationError('Index with alias "{index_alias}" does '
                                 'not exist'.format(index_alias=index_alias))

    current_index_name = current_index_aliases.keys()[0]
    # Check if the source specifies that any update should be added to
    # the current index instead of a new one
    if source_definition.get('keep_index_on_update'):
        new_index_name = current_index_name
    else:
        new_index_name = '{index_alias}_{now}'.format(
            index_alias=index_alias,
            now=datetime.utcnow().strftime('%Y%m%d%H%M%S')
        )

    # Parameters that are passed to each task in the chain
    params = {
        'run_identifier': 'pipeline_{}'.format(uuid4().hex),
        'current_index_name': current_index_name,
        'new_index_name': new_index_name,
        'index_alias': index_alias
    }

    logger.debug('Starting run with identifier %s' % params['run_identifier'])

    #run = Run(RunIdentifier, params['run_identifier'], 'ori')
    #run.save()
    #params['run_node'] = run

    celery_app.backend.set(params['run_identifier'], 'running')
    run_identifier_chains = '{}_chains'.format(params['run_identifier'])

    # We can have multiple pipelines, but for compatibility and readability
    # use the source definition if no specific pipelines have been defined
    pipelines = source_definition.get('pipelines', None) or [source_definition]

    pipeline_definitions = {}
    pipeline_extractors = {}
    pipeline_extensions = {}
    pipeline_transformers = {}
    pipeline_enrichers = {}
    pipeline_loaders = {}

    for pipeline in pipelines:
        if 'id' not in pipeline:
            raise ConfigurationError("Each pipeline must have an id field.")

        # adjusted source definitions per pipeline. This way you can, for
        # example, change the index on a per-pipeline basis
        pipeline_definitions[pipeline['id']] = deepcopy(source_definition)
        pipeline_definitions[pipeline['id']].update(pipeline)

        # initialize the ETL classes, per pipeline
        pipeline_extractors[pipeline['id']] = load_object(
            pipeline_definitions[pipeline['id']]['extractor'])

        pipeline_extensions[pipeline['id']] = [
            load_object(cls) for cls in
            pipeline_definitions[pipeline['id']].get('extensions', [])]

        if pipeline.get('transformer'):
            pipeline_transformers[pipeline['id']] = load_object(
                pipeline['transformer'])()

        pipeline_enrichers[pipeline['id']] = [
            (load_object(enricher[0])(), enricher[1] or {}) for enricher in
            pipeline_definitions[pipeline['id']].get('enrichers', [])]

        pipeline_loaders[pipeline['id']] = list()
        for cls in pipeline_definitions[pipeline['id']].get('loaders', None) or \
                [pipeline_definitions[pipeline['id']].get('loader', None)]:
            if cls:
                pipeline_loaders[pipeline['id']].append(load_object(cls)())

    result = None
    for pipeline in pipelines:
        try:
            # The first extractor should be a generator instead of a task
            for item in pipeline_extractors[pipeline['id']](
                    source_definition=pipeline_definitions[pipeline['id']]).run():

                step_chain = list()

                params['chain_id'] = uuid4().hex
                celery_app.backend.add_value_to_set(
                    set_name=run_identifier_chains,
                    value=params['chain_id'])

                # Remaining extractors
                for extension in pipeline_extensions[pipeline['id']]:
                    step_chain.append(extension().s(
                        *item,
                        source_definition=pipeline_definitions[pipeline['id']],
                        **params
                    )
                    )
                    # Prevent old item being passed down to next steps
                    item = []

                # Transformers
                if pipeline_transformers.get(pipeline['id']):
                    step_chain.append(pipeline_transformers[pipeline['id']].s(
                        *item,
                        source_definition=pipeline_definitions[pipeline['id']],
                        **params)
                    )

                # Enrichers
                for enricher_task, enricher_settings in pipeline_enrichers[
                    pipeline['id']
                ]:
                    step_chain.append(enricher_task.s(
                        source_definition=pipeline_definitions[
                            pipeline['id']],
                        enricher_settings=enricher_settings,
                        **params
                    )
                    )

                # Loaders
                # Multiple loaders make it possible to save to different stores
                initialized_loaders = []
                for loader in pipeline_loaders[pipeline['id']]:
                    initialized_loaders.append(loader.s(
                        source_definition=pipeline_definitions[
                            pipeline['id']],
                        **params))
                step_chain.append(group(initialized_loaders))

                result = chain(step_chain).delay()
        except KeyboardInterrupt:
            logger.warning('KeyboardInterrupt received. Stopping the program.')
            exit()
        except Exception as e:
            logger.error('An exception has occurred in the "{extractor}" extractor.'
                         ' Setting status of run identifier "{run_identifier}" to '
                         '"error":\n{message}'
                         .format(index=params['new_index_name'],
                                 run_identifier=params['run_identifier'],
                                 extractor=pipeline_extractors[pipeline['id']],
                                 message=e,
                                 )
                         )

            celery_app.backend.set(params['run_identifier'], 'error')
            raise
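This is the first variant that reads a 'pipelines' key: each entry is overlaid on a deepcopy of the shared source definition, so fields such as the extractor or index can vary per pipeline. A hypothetical definition (ids and dotted paths are illustrative):

source_definition = {
    'id': 'municipality',
    'extractor': 'ocd_backend.extractors.staticfile.StaticJSONExtractor',
    'transformer': 'ocd_backend.transformers.BaseTransformer',
    'loader': 'ocd_backend.loaders.ElasticsearchLoader',
    'pipelines': [
        {'id': 'meetings', 'index_name': 'municipality_meetings'},
        {'id': 'documents',
         'extractor': 'ocd_backend.extractors.staticfile.StaticHTMLExtractor'},
    ],
}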
Example #16
def _load_source(source):
    for phase in ['extractor', 'transformer', 'item', 'loader']:
        module = load_object(source[phase])
Example #17
def setup_pipeline(source_definition):
    logger.debug('[%s] Starting pipeline for source: %s' % (source_definition['key'], source_definition.get('id')))

    # index_name is an alias of the current version of the index
    index_alias = '{prefix}_{index_name}'.format(
        prefix=source_definition.get('es_prefix', settings.DEFAULT_INDEX_PREFIX),
        index_name=source_definition.get('index_name',
                                         source_definition.get('id'))
    )

    if not es.indices.exists(index_alias):
        index_name = '{index_alias}_{now}'.format(index_alias=index_alias,
                                                  now=datetime.utcnow()
                                                  .strftime('%Y%m%d%H%M%S'))

        es.indices.create(index_name)
        es.indices.put_alias(name=index_alias, index=index_name)

    # Find the current index name behind the alias specified in the config
    try:
        current_index_aliases = es.indices.get_alias(name=index_alias)
    except NotFoundError:
        raise ConfigurationError('Index with alias "{index_alias}" does '
                                 'not exist'.format(index_alias=index_alias))

    current_index_name = current_index_aliases.keys()[0]
    # Check if the source specifies that any update should be added to
    # the current index instead of a new one
    if source_definition.get('keep_index_on_update'):
        new_index_name = current_index_name
    else:
        new_index_name = '{index_alias}_{now}'.format(
            index_alias=index_alias,
            now=datetime.utcnow().strftime('%Y%m%d%H%M%S')
        )

    # Parameters that are passed to each task in the chain
    params = {
        'run_identifier': 'pipeline_{}'.format(uuid4().hex),
        'current_index_name': current_index_name,
        'new_index_name': new_index_name,
        'index_alias': index_alias,
    }

    logger.debug('[%s] Starting run with identifier %s' % (source_definition['key'], params['run_identifier']))

    celery_app.backend.set(params['run_identifier'], 'running')
    run_identifier_chains = '{}_chains'.format(params['run_identifier'])

    # We can have multiple pipelines, but for compatibility and readability
    # use the source definition if no specific pipelines have been defined
    pipelines = source_definition.get('pipelines', None) or [source_definition]

    pipeline_definitions = {}
    pipeline_extractors = {}
    pipeline_transformers = {}
    pipeline_enrichers = {}
    pipeline_loaders = {}

    for pipeline in pipelines:
        if 'id' not in pipeline:
            raise ConfigurationError("Each pipeline must have an id field.")

        # adjusted source definitions per pipeline. This way you can for
        # example change the index on a pipeline basis
        pipeline_definitions[pipeline['id']] = deepcopy(source_definition)
        pipeline_definitions[pipeline['id']].update(pipeline)

        # initialize the ETL classes, per pipeline
        pipeline_extractors[pipeline['id']] = load_object(
            pipeline_definitions[pipeline['id']]['extractor'])

        pipeline_transformers[pipeline['id']] = load_object(
            pipeline_definitions[pipeline['id']]['transformer'])

        pipeline_enrichers[pipeline['id']] = [
            (load_object(enricher) or {}) for enricher in
            pipeline_definitions[pipeline['id']].get('enrichers', [])]

        pipeline_loaders[pipeline['id']] = list()
        for cls in pipeline_definitions[pipeline['id']].get('loaders', None) or \
                [pipeline_definitions[pipeline['id']].get('loader', None)]:
            if cls:
                pipeline_loaders[pipeline['id']].append(load_object(cls))

    result = None
    for pipeline in pipelines:
        try:
            # The first extractor should be a generator instead of a task
            for item in pipeline_extractors[pipeline['id']](
                    source_definition=pipeline_definitions[pipeline['id']]).run():
                step_chain = list()

                params['chain_id'] = uuid4().hex
                params['start_time'] = datetime.now()

                celery_app.backend.add_value_to_set(
                    set_name=run_identifier_chains,
                    value=params['chain_id'])

                # Transformers
                if pipeline_transformers.get(pipeline['id']):
                    step_chain.append(pipeline_transformers[pipeline['id']].s(
                        *item,
                        source_definition=pipeline_definitions[pipeline['id']],
                        **params)
                    )

                # Enrichers
                for enricher_task in pipeline_enrichers[
                    pipeline['id']
                ]:
                    step_chain.append(enricher_task.s(
                        source_definition=pipeline_definitions[
                            pipeline['id']],
                        **params
                    )
                    )

                # Loaders
                # Multiple loaders make it possible to save to different stores
                initialized_loaders = []
                for loader in pipeline_loaders[pipeline['id']]:
                    initialized_loaders.append(loader.s(
                        source_definition=pipeline_definitions[
                            pipeline['id']],
                        **params))
                step_chain.append(group(initialized_loaders))

                result = chain(step_chain).delay()
        except KeyboardInterrupt:
            logger.warning('KeyboardInterrupt received. Stopping the program.')
            exit()
        except Exception as e:
            logger.error('[{site_name}] Pipeline has failed. Setting status of '
                         'run identifier "{run_identifier}" to "error":\n{message}'
                         .format(index=params['new_index_name'],
                                 run_identifier=params['run_identifier'],
                                 extractor=pipeline_extractors[pipeline['id']],
                                 message=e,
                                 site_name=source_definition['key'],
                                 )
                         )

            celery_app.backend.set(params['run_identifier'], 'error')

            # Reraise the exception so celery can autoretry
            raise
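The final step in each chain here is a Celery group, so every loader signature receives the same payload and runs in parallel. A standalone sketch of that fan-out (broker and task names are illustrative):

from celery import Celery, chain, group

app = Celery('sketch', broker='memory://')

@app.task
def transform(raw):
    return raw.upper()

@app.task
def load_es(doc):
    return 'es: %s' % doc

@app.task
def load_file(doc):
    return 'file: %s' % doc

step_chain = [transform.s('raw item'), group(load_es.s(), load_file.s())]
result = chain(step_chain).delay()  # a worker would run both loaders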
Example #18
def setup_pipeline(source_definition):
    # index_name is an alias of the current version of the index
    index_alias = '{prefix}_{index_name}'.format(
        prefix=settings.DEFAULT_INDEX_PREFIX,
        index_name=source_definition.get('index_name',
                                         source_definition.get('id'))
    )

    if not es.indices.exists(index_alias):
        index_name = '{index_alias}_{now}'.format(index_alias=index_alias,
                                                  now=datetime.utcnow()
                                                  .strftime('%Y%m%d%H%M%S'))

        es.indices.create(index_name)
        es.indices.put_alias(name=index_alias, index=index_name)

    # Find the current index name behind the alias specified in the config
    try:
        current_index_aliases = es.indices.get_alias(name=index_alias)
    except NotFoundError:
        raise ConfigurationError('Index with alias "{index_alias}" does '
                                 'not exist'.format(index_alias=index_alias))

    current_index_name = current_index_aliases.keys()[0]
    # Check if the source specifies that any update should be added to
    # the current index instead of a new one
    if source_definition['keep_index_on_update']:
        new_index_name = current_index_name
    else:
        new_index_name = '{index_alias}_{now}'.format(
            index_alias=index_alias,
            now=datetime.utcnow().strftime('%Y%m%d%H%M%S')
        )

    extractor = load_object(source_definition['extractor'])(source_definition)
    transformer = load_object(source_definition['transformer'])()
    enrichers = [(load_object(enricher[0])(), enricher[1]) for enricher in
                 source_definition['enrichers']]
    loader = load_object(source_definition['loader'])()

    # Parameters that are passed to each task in the chain
    params = {
        'run_identifier': 'pipeline_{}'.format(uuid4().hex),
        'current_index_name': current_index_name,
        'new_index_name': new_index_name,
        'index_alias': index_alias
    }

    celery_app.backend.set(params['run_identifier'], 'running')
    run_identifier_chains = '{}_chains'.format(params['run_identifier'])

    try:
        for item in extractor.run():
            # Generate an identifier for each chain, and record that in
            # {}_chains, so that we can know for sure when all tasks
            # from an extractor have finished
            params['chain_id'] = uuid4().hex
            celery_app.backend.add_value_to_set(set_name=run_identifier_chains,
                                                value=params['chain_id'])

            item_chain = chain()

            # Transform
            item_chain |= transformer.s(
                *item,
                source_definition=source_definition,
                **params
            )

            # Enrich
            for enricher_task, enricher_settings in enrichers:
                item_chain |= enricher_task.s(
                    source_definition=source_definition,
                    enricher_settings=enricher_settings,
                    **params
                )

            # Load
            item_chain |= loader.s(
                source_definition=source_definition,
                **params
            )

            item_chain.delay()
    except:
        logger.error('An exception has occurred in the "{extractor}" extractor. '
                     'Setting status of run identifier "{run_identifier}" to '
                     '"error".'
                     .format(index=new_index_name,
                             run_identifier=params['run_identifier'],
                             extractor=source_definition['extractor']))

        celery_app.backend.set(params['run_identifier'], 'error')
        raise

    celery_app.backend.set(params['run_identifier'], 'done')
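Concretely, with DEFAULT_INDEX_PREFIX set to 'ocd' (an assumption) and a source id of 'demo', the naming scheme used above produces something like:

index_alias = 'ocd_demo'                    # stable alias that queries use
new_index_name = 'ocd_demo_20240101120000'  # timestamped physical index
# put_alias points the stable alias at the new physical index, so searches
# keep working while a fresh index is built behind it.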
Example #19
def setup_pipeline(source_definition):
    current_date_and_time = datetime.utcnow()

    # first the secondary index
    index_alias = initialize_index(source_definition, current_date_and_time)
    current_index_name = get_current_index(index_alias)
    new_index_name = get_new_index(source_definition, current_index_name,
                                   index_alias, current_date_and_time)

    # now the combined index
    index_names = source_definition.get('index_names', ['combined_index'])

    current_index_names = []
    new_index_names = []
    additional_aliases = []
    for cur_index in index_names:
        additional_source_definition = {
            'id': 'test_source',
            'index_name': cur_index,
            'keep_index_on_update': source_definition['keep_index_on_update']
        }
        additional_alias = '%s_%s' % (
            settings.DEFAULT_INDEX_PREFIX,
            cur_index,
        )
        additional_aliases.append(additional_alias)
        initialize_index(additional_source_definition, current_date_and_time)
        current_additional_index_name = get_current_index(additional_alias)
        new_additional_index_name = get_new_index(
            additional_source_definition, current_additional_index_name,
            additional_alias, current_date_and_time)
        current_index_names.append(current_additional_index_name)
        new_index_names.append(new_additional_index_name)
        print "Setting up alias for %s (-> %s)" % (
            cur_index,
            new_additional_index_name,
        )

    # now load objects and prepare the run ...
    extractor = load_object(source_definition['extractor'])(source_definition)
    transformer = load_object(source_definition['transformer'])()
    enrichers = [(load_object(enricher[0])(), enricher[1])
                 for enricher in source_definition['enrichers']]
    loader = load_object(source_definition['loader'])()

    # Parameters that are passed to each task in the chain
    params = {
        'run_identifier': 'pipeline_{}'.format(uuid4().hex),
        'current_index_name': current_index_name,
        'new_index_name': new_index_name,
        'index_alias': index_alias,
        'index_aliases': additional_aliases,
        'current_index_names': current_index_names,
        'new_index_names': new_index_names
    }

    celery_app.backend.set(params['run_identifier'], 'running')
    run_identifier_chains = '{}_chains'.format(params['run_identifier'])

    try:
        for item in extractor.run():
            # Generate an identifier for each chain, and record that in
            # {}_chains, so that we can know for sure when all tasks
            # from an extractor have finished
            params['chain_id'] = uuid4().hex
            celery_app.backend.add_value_to_set(set_name=run_identifier_chains,
                                                value=params['chain_id'])

            item_chain = chain()

            # Transform
            item_chain |= transformer.s(*item,
                                        source_definition=source_definition,
                                        **params)

            # Enrich
            for enricher_task, enricher_settings in enrichers:
                item_chain |= enricher_task.s(
                    source_definition=source_definition,
                    enricher_settings=enricher_settings,
                    **params)

            # Load
            item_chain |= loader.s(source_definition=source_definition,
                                   **params)

            item_chain.delay()
    except:
        logger.error(
            'An exception has occurred in the "{extractor}" extractor. '
            'Setting status of run identifier "{run_identifier}" to '
            '"error".'.format(index=new_index_name,
                              run_identifier=params['run_identifier'],
                              extractor=source_definition['extractor']))

        celery_app.backend.set(params['run_identifier'], 'error')
        raise

    celery_app.backend.set(params['run_identifier'], 'done')
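The twist in this last variant is that each run also maintains shared indices next to the per-source one. With DEFAULT_INDEX_PREFIX set to 'ocd' (an assumption) and the default index_names, the bookkeeping above yields roughly:

additional_aliases = ['ocd_combined_index']  # shared, cross-source alias
# current_index_names / new_index_names travel with params, so a loader can
# write each document both to its own index and to the combined index.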