def load_dump(collection_dump, collection_name):
    """
    Restore an index from a dump file.

    :param collection_dump: Path to a local gzipped dump to load. When empty,
        the dumps matching ``*/*.gz`` under ``LOCAL_DUMPS_DIR`` are listed and
        the user is prompted to choose one of them.
    :param collection_name: Name for the local index to restore the dump to.
        Optional; will be derived from the dump name, at your own risk. Note
        that the pipeline will add a "oad_" prefix string to the collection
        name, to ensure the proper mapping and settings are applied.
    """
    if not collection_dump:
        available_dumps = glob(os.path.join(LOCAL_DUMPS_DIR, '*/*.gz'))

        # A prompt with an empty list of choices can never be answered, so
        # report the problem and stop instead of asking.
        if not available_dumps:
            click.secho(
                'Error: no dumps found in "{}"'.format(LOCAL_DUMPS_DIR),
                fg='red')
            return

        choices = []
        for number, dump in enumerate(available_dumps, 1):
            # ``unicode`` is the Python 2 text type; the choices are the
            # 1-based positions shown to the user.
            choices.append(unicode(number))
            click.secho('{i}) {dump}'.format(i=number, dump=dump), fg='green')

        dump_idx = click.prompt('Choose one of the dumps listed above',
                                type=click.Choice(choices))
        collection_dump = available_dumps[int(dump_idx) - 1]

    collection = os.path.abspath(collection_dump)

    # The id is made of the first two "_"-separated parts of the file name,
    # ignoring everything after the first dot.
    dump_name = os.path.basename(collection).split('.')[0]
    collection_id = '_'.join(dump_name.split('_')[:2])

    if not collection_name:
        # Strip only a leading "oad_"; a plain ``replace`` would also remove
        # the sequence from the middle of an id such as "load_x".
        prefix = 'oad_'
        if collection_id.startswith(prefix):
            collection_name = collection_id[len(prefix):]
        else:
            collection_name = collection_id

    source_definition = {
        'id': collection_id,
        'extractor': 'ocd_backend.extractors.staticfile.StaticJSONDumpExtractor',
        'transformer': 'ocd_backend.transformers.BaseTransformer',
        'loader': 'ocd_backend.loaders.ElasticsearchLoader',
        "cleanup": "ocd_backend.tasks.CleanupElasticsearch",
        'item': 'ocd_backend.items.LocalDumpItem',
        "enrichers": [[
            "ocd_backend.enrichers.media_enricher.MediaEnricher",
            {
                "tasks": ["media_type", "image_metadata"]
            }
        ]],
        'dump_path': collection,
        'index_name': collection_name
    }

    click.secho(str(source_definition), fg='yellow')

    setup_pipeline(source_definition)

    click.secho(
        'Queued items from {}. Please make sure your Celery workers'
        ' are running, so the loaded items are processed.'.format(collection),
        fg='green')
def extract_start(source_id, subitem, entiteit, sources_config):
    """
    Start extraction for a pipeline specified by ``source_id`` defined in
    ``--sources-config``. ``--sources-config`` defaults to
    ``settings.SOURCES_CONFIG_FILE``.

    When ``id`` is specified in the source it will trigger old-style json
    behaviour for backward compatibility reasons. Otherwise new-style yaml is
    assumed, which looks for ``entities`` in the source to determine the order
    in which entities are processed. If no ``entities`` are found in the
    source, all subitems of the source will be processed, if any. If one or
    more ``--subitem`` is specified, only those subitems will be processed.
    When one or more ``--entiteit`` is specified, only those entities will be
    processed. By default, all entities are processed.

    Note: ``--subitem`` and ``--entiteit`` only work in new-style yaml
    configurations.

    :param sources_config: Path to file containing pipeline definitions.
        Defaults to the value of ``settings.SOURCES_CONFIG_FILE``
    :param source_id: identifier used in ``--sources_config`` to describe
        pipeline
    :param subitem: one or more items under the parent ``source_id`` to
        specify which subitems should be run
    :param entiteit: one or more entity arguments to specify which entities
        should be run
    """
    sources = load_sources_config(sources_config)

    # Find the requested source definition in the list of available sources
    source = sources.get(source_id)

    # Without a config we can't do anything, notify the user and exit
    if not source:
        click.echo('Error: unable to find source with id "%s" in sources '
                   'config' % source_id)
        return

    # Check for old-style json sources
    if 'id' in source:
        setup_pipeline(source)
        return

    # New-style behaviour
    if 'entities' in source:
        # Only one main source
        selected_sources = {source_id: source}
    elif subitem:
        # Report unknown subitems the same way as an unknown source id,
        # instead of failing with a bare KeyError.
        unknown = [name for name in subitem if name not in source]
        for name in unknown:
            click.echo('Error: unable to find subitem "%s" in source "%s"'
                       % (name, source_id))
        if unknown:
            return

        # Add the specified subs
        selected_sources = dict((name, source[name]) for name in subitem)
    else:
        # All sub sources if no subs are specified
        selected_sources = source

    # Processing each item
    for selected_id, selected in selected_sources.items():
        entities = selected.get('entities')
        if entities is None:
            # Nothing to iterate over; tell the user which source is affected
            # and carry on with the remaining ones.
            click.echo('Error: no entities found for source "%s"'
                       % selected_id)
            continue

        for item in entities:
            if entiteit:
                # Only run the entities that were explicitly requested
                wanted = item.get('entity') in entiteit
            else:
                # No filter given: run every non-empty entity
                wanted = bool(item)

            if wanted:
                # Each entity runs as its own pipeline: a copy of the source
                # with the entity's keys laid over it.
                new_source = deepcopy(selected)
                new_source.update(item)
                setup_pipeline(new_source)
def load_dump(collection_dump, collection_name):
    """
    Restore an index from a dump file.

    :param collection_dump: Path to a local gzipped dump to load. When empty,
        the dumps matching ``*/*.gz`` under ``LOCAL_DUMPS_DIR`` are listed and
        the user is prompted to choose one of them.
    :param collection_name: Name for the local index to restore the dump to.
        Optional; will be derived from the dump name, at your own risk. Note
        that the pipeline will add a "ocd_" prefix string to the collection
        name, to ensure the proper mapping and settings are applied.
    """
    if not collection_dump:
        available_dumps = glob(os.path.join(LOCAL_DUMPS_DIR, '*/*.gz'))

        # A prompt with an empty list of choices can never be answered, so
        # report the problem and stop instead of asking.
        if not available_dumps:
            click.secho(
                'Error: no dumps found in "{}"'.format(LOCAL_DUMPS_DIR),
                fg='red')
            return

        choices = []
        for number, dump in enumerate(available_dumps, 1):
            # ``unicode`` is the Python 2 text type; the choices are the
            # 1-based positions shown to the user.
            choices.append(unicode(number))
            click.secho('{i}) {dump}'.format(i=number, dump=dump), fg='green')

        dump_idx = click.prompt('Choose one of the dumps listed above',
                                type=click.Choice(choices))
        collection_dump = available_dumps[int(dump_idx) - 1]

    collection = os.path.abspath(collection_dump)

    # The id is made of the first two "_"-separated parts of the file name,
    # ignoring everything after the first dot.
    dump_name = os.path.basename(collection).split('.')[0]
    collection_id = '_'.join(dump_name.split('_')[:2])

    if not collection_name:
        # Strip only a leading "ocd_"; a plain ``replace`` would also remove
        # the sequence when it occurs further on in the id.
        prefix = 'ocd_'
        if collection_id.startswith(prefix):
            collection_name = collection_id[len(prefix):]
        else:
            collection_name = collection_id

    source_definition = {
        'id': collection_id,
        'extractor': 'ocd_backend.extractors.staticfile.StaticJSONDumpExtractor',
        'transformer': 'ocd_backend.transformers.BaseTransformer',
        'loader': 'ocd_backend.loaders.ElasticsearchLoader',
        "cleanup": "ocd_backend.tasks.CleanupElasticsearch",
        'item': 'ocd_backend.items.LocalDumpItem',
        "enrichers": [
            [
                "ocd_backend.enrichers.media_enricher.MediaEnricher",
                {
                    "tasks": ["media_type", "image_metadata"]
                }
            ]
        ],
        'dump_path': collection,
        'index_name': collection_name
    }

    click.secho(str(source_definition), fg='yellow')

    setup_pipeline(source_definition)

    click.secho('Queued items from {}. Please make sure your Celery workers'
                ' are running, so the loaded items are processed.'.format(collection),
                fg='green')
def extract_start(source_id, sources_config):
    """
    Start extraction for a specified source.

    :param source_id: value of the ``id`` key of the source definition to
        run.
    :param sources_config: sources configuration handed to
        ``load_sources_config``; ``SOURCES_CONFIG_FILE`` is used when this is
        empty.
    """
    if not sources_config:
        sources_config = SOURCES_CONFIG_FILE

    sources = load_sources_config(sources_config)

    # Find the requested source definition in the list of available sources.
    # Every candidate is inspected, so when several definitions share an id
    # the last one is used. Candidates without an ``id`` key are skipped
    # rather than aborting the lookup with a KeyError.
    source = None
    for candidate_source in sources:
        if 'id' in candidate_source and candidate_source['id'] == source_id:
            source = candidate_source

    # Without a config we can't do anything, notify the user and exit
    if not source:
        click.echo('Error: unable to find source with id "%s" in sources '
                   'config' % source_id)
        return

    setup_pipeline(source)
def extract_start(source_id, sources_config):
    """
    Start extraction for a pipeline specified by ``source_id`` defined in
    ``--sources-config``. ``--sources-config`` defaults to
    ``settings.SOURCES_CONFIG_FILE``.

    :param sources_config: Path to file containing pipeline definitions.
        Defaults to the value of ``settings.SOURCES_CONFIG_FILE``
    :param source_id: identifier used in ``--sources_config`` to describe
        pipeline
    """
    sources = load_sources_config(sources_config)

    # Find the requested source definition in the list of available sources.
    # Every candidate is inspected, so when several definitions share an id
    # the last one is used. Candidates without an ``id`` key are skipped
    # rather than aborting the lookup with a KeyError.
    source = None
    for candidate_source in sources:
        if 'id' in candidate_source and candidate_source['id'] == source_id:
            source = candidate_source

    # Without a config we can't do anything, notify the user and exit
    if not source:
        click.echo('Error: unable to find source with id "%s" in sources '
                   'config' % source_id)
        return

    setup_pipeline(source)