def __init__(self, *args, **kwargs):
    """Validate and store the required 'item_xpath' setting."""
    super(StaticXmlExtractor, self).__init__(*args, **kwargs)
    try:
        item_xpath = self.source_definition['item_xpath']
    except KeyError:
        raise ConfigurationError('Missing \'item_xpath\' definition')
    if not item_xpath:
        raise ConfigurationError('The \'item_xpath\' is empty')
    self.item_xpath = item_xpath
def __init__(self, *args, **kwargs):
    """Validate and store the required 'file_url' setting."""
    super(StaticFileBaseExtractor, self).__init__(*args, **kwargs)
    try:
        file_url = self.source_definition['file_url']
    except KeyError:
        raise ConfigurationError('Missing \'file_url\' definition')
    if not file_url:
        raise ConfigurationError('The \'file_url\' is empty')
    self.file_url = file_url
def __init__(self, *args, **kwargs):
    """Validate paging configuration for the HTML extractor."""
    super(PagingHTMLExtractor, self).__init__(*args, **kwargs)
    try:
        next_page_xpath = self.source_definition['next_page_xpath']
    except KeyError:
        raise ConfigurationError('Missing \'next_page_xpath\' definition')
    if not next_page_xpath:
        raise ConfigurationError('The \'next_page_xpath\' is empty')
    self.next_page_xpath = next_page_xpath
    # default max 5 pages
    self.next_page_max_count = self.source_definition.get(
        'next_page_max_count', 5)
def start(self, *args, **kwargs):
    """Start loading; requires 'new_index_name' in kwargs.

    Raises ConfigurationError when no index name was supplied.
    """
    self.index_name = kwargs.get('new_index_name')
    if self.index_name:
        return super(ElasticsearchLoader, self).start(*args, **kwargs)
    raise ConfigurationError('The name of the index is not provided')
def __init__(self, *args, **kwargs):
    """Validate and store the required 'path' and 'pattern' settings."""
    super(LocalPathBaseExtractor, self).__init__(*args, **kwargs)
    # Check both keys in order: missing first, then empty.
    for required in ('path', 'pattern'):
        if required not in self.source_definition:
            raise ConfigurationError('Missing \'%s\' definition' % (required,))
        if not self.source_definition[required]:
            raise ConfigurationError('The \'%s\' is empty' % (required,))
    self.path = self.source_definition['path']
    self.pattern = self.source_definition['pattern']
def __init__(self, *args, **kwargs):
    """Read the greenvalley_* connection settings from the source definition."""
    super(GreenValleyBaseExtractor, self).__init__(*args, **kwargs)
    self.base_url = self.username = self.key = self.hash = None
    for name in ('base_url', 'username', 'key', 'hash'):
        source_key = 'greenvalley_%s' % (name,)
        try:
            value = self.source_definition[source_key]
        except KeyError:
            raise ConfigurationError(
                'Missing \'%s\' definition' % (source_key,))
        if not value:
            raise ConfigurationError('The \'%s\' is empty' % (source_key,))
        setattr(self, name, value)
def run(self, *args, **kwargs):
    """Run the loader; requires 'new_index_name' in kwargs.

    Also records the current index name and the index alias on the
    instance. Raises ConfigurationError when no index name was supplied.
    """
    self.current_index_name = kwargs.get('current_index_name')
    self.index_name = kwargs.get('new_index_name')
    self.alias = kwargs.get('index_alias')
    if self.index_name:
        return super(ElasticsearchLoader, self).run(*args, **kwargs)
    raise ConfigurationError('The name of the index is not provided')
def get_current_index(index_alias):
    """Return the name of the concrete index currently behind ``index_alias``.

    :param index_alias: alias configured for the source's index.
    :raises ConfigurationError: when no index exists for the alias.
    """
    # Find the current index name behind the alias specified in the config
    try:
        current_index_aliases = es.indices.get_alias(name=index_alias)
    except NotFoundError:
        raise ConfigurationError('Index with alias "{index_alias}" does '
                                 'not exist'.format(index_alias=index_alias))
    # BUGFIX: dict.keys() is not subscriptable on Python 3;
    # next(iter(...)) yields the first key on both Python 2 and 3.
    return next(iter(current_index_aliases))
def __init__(self, *args, **kwargs):
    """Validate the 'facebook' settings block and expose it as fb_* attributes."""
    super(FacebookExtractor, self).__init__(*args, **kwargs)
    try:
        facebook = self.source_definition['facebook']
    except KeyError:
        raise ConfigurationError('Missing \'facebook\' definition')
    for fld in ('api_version', 'app_id', 'app_secret', 'graph_url'):
        if fld not in facebook:
            raise ConfigurationError(
                'Missing \'%s\' definition of facebook' % (fld,))
        if not facebook[fld]:
            raise ConfigurationError(
                'The \'%s\' in facebook is empty' % (fld,))
        setattr(self, 'fb_%s' % (fld,), facebook[fld])
def __init__(self, *args, **kwargs):
    """Validate item/item-id XPath settings and the optional default namespace."""
    super(StaticHtmlExtractor, self).__init__(*args, **kwargs)
    for attr in ('item_xpath', 'item_id_xpath'):
        if attr not in self.source_definition:
            raise ConfigurationError('Missing \'%s\' definition' % (attr,))
        if not self.source_definition[attr]:
            raise ConfigurationError('The \'%s\' is empty' % (attr,))
        setattr(self, attr, self.source_definition[attr])
    # Optional; None when the source definition does not specify one.
    self.default_namespace = self.source_definition.get('default_namespace')
def run(self, *args, **kwargs):
    """Run the loader; requires 'new_index_name' in kwargs.

    Resolves the (possibly timestamped) combined index name from
    'new_index_names' and reads 'doc_type' from the source definition.
    Raises ConfigurationError when no index name was supplied.
    """
    self.current_index_name = kwargs.get('current_index_name')
    self.index_name = kwargs.get('new_index_name')
    self.alias = kwargs.get('index_alias')
    self.new_index_names = kwargs.get('new_index_names',
                                      [settings.COMBINED_INDEX])
    # Prefer a timestamped combined index name ('<combined>_...') when one
    # is present; otherwise fall back to the plain combined index name.
    # (Replaces try/except-IndexError control flow with next()'s default.)
    combined_prefix = '%s_' % (settings.COMBINED_INDEX,)
    self.combined_index_name = next(
        (i for i in self.new_index_names if i.startswith(combined_prefix)),
        settings.COMBINED_INDEX)
    self.doc_type = kwargs['source_definition'].get('doc_type', 'item')
    if not self.index_name:
        raise ConfigurationError('The name of the index is not provided')
    return super(ElasticsearchLoader, self).run(*args, **kwargs)
def __init__(self, *args, **kwargs):
    """Require a non-empty 'dump_path' in the source definition."""
    super(StaticJSONDumpExtractor, self).__init__(*args, **kwargs)
    dump_path = self.source_definition.get('dump_path')
    if not dump_path:
        raise ConfigurationError('Missing \'dump_path\' definition')
def setup_pipeline(source_definition):
    """Set up and start the celery ETL chains for one configured source.

    Ensures the target Elasticsearch index (and its alias) exists, builds
    per-pipeline extractor/transformer/enricher/loader objects from the
    source definition, then dispatches one celery chain per extracted item.
    On failure the run identifier's status is set to 'error' and the
    exception is re-raised so celery can autoretry.
    """
    logger.debug('[%s] Starting pipeline for source: %s' % (
        source_definition['key'], source_definition.get('id')))

    # index_name is an alias of the current version of the index
    index_alias = '{prefix}_{index_name}'.format(
        prefix=source_definition.get('es_prefix',
                                     settings.DEFAULT_INDEX_PREFIX),
        index_name=source_definition.get('index_name',
                                         source_definition.get('id'))
    )

    if not es.indices.exists(index_alias):
        index_name = '{index_alias}_{now}'.format(
            index_alias=index_alias,
            now=datetime.utcnow().strftime('%Y%m%d%H%M%S'))
        es.indices.create(index_name)
        es.indices.put_alias(name=index_alias, index=index_name)

    # Find the current index name behind the alias specified in the config
    try:
        current_index_aliases = es.indices.get_alias(name=index_alias)
    except NotFoundError:
        raise ConfigurationError('Index with alias "{index_alias}" does '
                                 'not exist'.format(index_alias=index_alias))

    # BUGFIX: dict.keys() is not subscriptable on Python 3;
    # next(iter(...)) yields the first key on both Python 2 and 3.
    current_index_name = next(iter(current_index_aliases))

    # Check if the source specifies that any update should be added to
    # the current index instead of a new one
    if source_definition.get('keep_index_on_update'):
        new_index_name = current_index_name
    else:
        new_index_name = '{index_alias}_{now}'.format(
            index_alias=index_alias,
            now=datetime.utcnow().strftime('%Y%m%d%H%M%S')
        )

    # Parameters that are passed to each task in the chain
    params = {
        'run_identifier': 'pipeline_{}'.format(uuid4().hex),
        'current_index_name': current_index_name,
        'new_index_name': new_index_name,
        'index_alias': index_alias,
    }

    logger.debug('[%s] Starting run with identifier %s' % (
        source_definition['key'], params['run_identifier']))

    celery_app.backend.set(params['run_identifier'], 'running')
    run_identifier_chains = '{}_chains'.format(params['run_identifier'])

    # we can have multiple pipelines, but for compatibility and readability
    # use the source definition if no specific pipelines have been defined
    pipelines = source_definition.get('pipelines', None) or [source_definition]

    pipeline_definitions = {}
    pipeline_extractors = {}
    pipeline_transformers = {}
    pipeline_enrichers = {}
    pipeline_loaders = {}

    for pipeline in pipelines:
        if 'id' not in pipeline:
            raise ConfigurationError("Each pipeline must have an id field.")

        # adjusted source definitions per pipeline. This way you can for
        # example change the index on a pipeline basis
        pipeline_definitions[pipeline['id']] = deepcopy(source_definition)
        pipeline_definitions[pipeline['id']].update(pipeline)

        # initialize the ETL classes, per pipeline
        pipeline_extractors[pipeline['id']] = load_object(
            pipeline_definitions[pipeline['id']]['extractor'])
        pipeline_transformers[pipeline['id']] = load_object(
            pipeline_definitions[pipeline['id']]['transformer'])
        pipeline_enrichers[pipeline['id']] = [
            (load_object(enricher) or {}) for enricher in
            pipeline_definitions[pipeline['id']].get('enrichers', [])]

        # A pipeline may declare several 'loaders' or a single 'loader'.
        pipeline_loaders[pipeline['id']] = list()
        for cls in pipeline_definitions[pipeline['id']].get('loaders', None) or \
                [pipeline_definitions[pipeline['id']].get('loader', None)]:
            if cls:
                pipeline_loaders[pipeline['id']].append(load_object(cls))

    result = None
    for pipeline in pipelines:
        try:
            # The first extractor should be a generator instead of a task
            for item in pipeline_extractors[pipeline['id']](
                    source_definition=pipeline_definitions[
                        pipeline['id']]).run():
                step_chain = list()

                params['chain_id'] = uuid4().hex
                params['start_time'] = datetime.now()

                celery_app.backend.add_value_to_set(
                    set_name=run_identifier_chains,
                    value=params['chain_id'])

                # Transformers
                if pipeline_transformers.get(pipeline['id']):
                    step_chain.append(pipeline_transformers[pipeline['id']].s(
                        *item,
                        source_definition=pipeline_definitions[
                            pipeline['id']],
                        **params))

                # Enrichers
                for enricher_task in pipeline_enrichers[pipeline['id']]:
                    step_chain.append(enricher_task.s(
                        source_definition=pipeline_definitions[
                            pipeline['id']],
                        **params))

                # Loaders
                # Multiple loaders to enable to save to different stores
                initialized_loaders = []
                for loader in pipeline_loaders[pipeline['id']]:
                    initialized_loaders.append(loader.s(
                        source_definition=pipeline_definitions[
                            pipeline['id']],
                        **params))
                step_chain.append(group(initialized_loaders))

                result = chain(step_chain).delay()
        except KeyboardInterrupt:
            logger.warning('KeyboardInterrupt received. Stopping the program.')
            exit()
        except Exception as e:
            # BUGFIX: 'except Exception, e' is Python-2-only syntax; 'as e'
            # works on both. Dropped the unused 'index'/'extractor' format
            # kwargs that did not appear in the format string.
            logger.error('[{site_name}] Pipeline has failed. Setting status of '
                         'run identifier "{run_identifier}" to "error":\n'
                         '{message}'
                         .format(run_identifier=params['run_identifier'],
                                 message=e,
                                 site_name=source_definition['key'],
                                 )
                         )
            celery_app.backend.set(params['run_identifier'], 'error')

            # Reraise the exception so celery can autoretry
            raise
def setup_pipeline(source_definition):
    """Set up and start the celery ETL chains for one configured source.

    Ensures the aliased Elasticsearch index exists, instantiates the
    extractor/transformer/enrichers/loader declared in the source
    definition, dispatches one celery chain per extracted item, and
    records the run status ('running' / 'error' / 'done') in the celery
    backend under the generated run identifier.
    """
    # index_name is an alias of the current version of the index
    index_alias = '{prefix}_{index_name}'.format(
        prefix=settings.DEFAULT_INDEX_PREFIX,
        index_name=source_definition.get('index_name',
                                         source_definition.get('id'))
    )

    if not es.indices.exists(index_alias):
        index_name = '{index_alias}_{now}'.format(
            index_alias=index_alias,
            now=datetime.utcnow().strftime('%Y%m%d%H%M%S'))
        es.indices.create(index_name)
        es.indices.put_alias(name=index_alias, index=index_name)

    # Find the current index name behind the alias specified in the config
    try:
        current_index_aliases = es.indices.get_alias(name=index_alias)
    except NotFoundError:
        raise ConfigurationError('Index with alias "{index_alias}" does '
                                 'not exist'.format(index_alias=index_alias))

    # BUGFIX: dict.keys() is not subscriptable on Python 3;
    # next(iter(...)) yields the first key on both Python 2 and 3.
    current_index_name = next(iter(current_index_aliases))

    new_index_name = '{index_alias}_{now}'.format(
        index_alias=index_alias,
        now=datetime.utcnow().strftime('%Y%m%d%H%M%S')
    )

    extractor = load_object(source_definition['extractor'])(source_definition)
    transformer = load_object(source_definition['transformer'])()
    enrichers = [(load_object(enricher[0])(), enricher[1]) for enricher in
                 source_definition['enrichers']]
    loader = load_object(source_definition['loader'])()

    # Parameters that are passed to each task in the chain
    params = {
        'run_identifier': 'pipeline_{}'.format(uuid4().hex),
        'current_index_name': current_index_name,
        'new_index_name': new_index_name,
        'index_alias': index_alias
    }

    celery_app.backend.set(params['run_identifier'], 'running')
    run_identifier_chains = '{}_chains'.format(params['run_identifier'])

    try:
        for item in extractor.run():
            # Generate an identifier for each chain, and record that in
            # {}_chains, so that we can know for sure when all tasks
            # from an extractor have finished
            params['chain_id'] = uuid4().hex
            celery_app.backend.add_value_to_set(
                set_name=run_identifier_chains,
                value=params['chain_id'])

            item_chain = chain()

            # Tranform
            item_chain |= transformer.s(
                *item,
                source_definition=source_definition,
                **params
            )

            # Enrich
            for enricher_task, enricher_settings in enrichers:
                item_chain |= enricher_task.s(
                    source_definition=source_definition,
                    enricher_settings=enricher_settings,
                    **params
                )

            # Load
            item_chain |= loader.s(
                source_definition=source_definition,
                **params
            )

            item_chain.delay()
    # Deliberately broad: mark the run as failed for *any* exception, then
    # re-raise so the caller/celery still sees the original error.
    except:
        logger.error('An exception has occured in the "{extractor}" extractor. '
                     'Deleting index "{index}" and setting status of run '
                     'identifier "{run_identifier}" to "error".'
                     .format(index=new_index_name,
                             run_identifier=params['run_identifier'],
                             extractor=source_definition['extractor']))
        celery_app.backend.set(params['run_identifier'], 'error')
        raise

    celery_app.backend.set(params['run_identifier'], 'done')