Esempio n. 1
0
def load_dump(collection_dump, collection_name):
    """
    Restore an index from a dump file.

    When no dump is given, the gzipped dumps found one directory level below
    ``LOCAL_DUMPS_DIR`` are listed and the user is prompted to pick one. A
    source definition is then built for the dump and handed to
    ``setup_pipeline``.

    :param collection_dump: Path to a local gzipped dump to load. Optional; when empty the user is prompted to choose one of the available dumps.
    :param collection_name: Name for the local index to restore the dump to. Optional; will be derived from the dump name, at your own risk. Note that the pipeline will add a "oad_" prefix string to the collection name, to ensure the proper mapping and settings are applied.
    """
    if not collection_dump:
        available_dumps = glob(os.path.join(LOCAL_DUMPS_DIR, '*/*.gz'))
        if not available_dumps:
            # Prompting with an empty list of choices can never be answered,
            # so bail out with a clear message instead.
            click.secho(
                'Error: no dumps found in {}'.format(LOCAL_DUMPS_DIR),
                fg='red')
            return

        choices = []
        for number, dump in enumerate(available_dumps, 1):
            # ``str`` exists on both Python 2 and 3 (``unicode`` does not).
            choices.append(str(number))
            click.secho('{i}) {dump}'.format(i=number, dump=dump), fg='green')
        dump_idx = click.prompt('Choose one of the dumps listed above',
                                type=click.Choice(choices))
        collection_dump = available_dumps[int(dump_idx) - 1]

    collection = os.path.abspath(collection_dump)

    # The id is made of the first two underscore-separated parts of the file
    # name, ignoring everything from the first dot onwards.
    file_stem = os.path.basename(collection).split('.')[0]
    collection_id = '_'.join(file_stem.split('_')[:2])

    if not collection_name:
        prefix = 'oad_'
        collection_name = collection_id
        # Strip the prefix only when it really is a prefix; a plain
        # ``replace`` would also mangle ids such as "road_x".
        if collection_name.startswith(prefix):
            collection_name = collection_name[len(prefix):]

    source_definition = {
        'id': collection_id,
        'extractor':
        'ocd_backend.extractors.staticfile.StaticJSONDumpExtractor',
        'transformer': 'ocd_backend.transformers.BaseTransformer',
        'loader': 'ocd_backend.loaders.ElasticsearchLoader',
        "cleanup": "ocd_backend.tasks.CleanupElasticsearch",
        'item': 'ocd_backend.items.LocalDumpItem',
        "enrichers": [[
            "ocd_backend.enrichers.media_enricher.MediaEnricher", {
                "tasks": ["media_type", "image_metadata"]
            }
        ]],
        'dump_path': collection,
        'index_name': collection_name
    }

    # Show the definition that is about to be queued.
    click.secho(str(source_definition), fg='yellow')

    setup_pipeline(source_definition)

    click.secho(
        'Queued items from {}. Please make sure your Celery workers'
        ' are running, so the loaded items are processed.'.format(collection),
        fg='green')
Esempio n. 2
0
def extract_start(source_id, subitem, entiteit, sources_config):
    """
    Start extraction for a pipeline specified by ``source_id`` defined in
    ``--sources-config``. ``--sources-config`` defaults to ``settings.SOURCES_CONFIG_FILE``.
    When ``id`` is specified in the source it will trigger old-style json behaviour for backward compatibility reasons.

    Otherwise new-style yaml is assumed, which looks for ``entities`` in the source to determine the order in which entities are processed.
    If no ``entities`` are found in the source, all subitems of the source will be processed, if any.
    If one or more ``--subitem`` is specified, only those subitems will be processed.
    When one or more ``--entiteit`` is specified, only those entities will be processed. By default, all entities are processed.

    Note: ``--subitem`` and ``--entiteit`` only work in new-style yaml configurations.

    :param sources_config: Path to file containing pipeline definitions. Defaults to the value of ``settings.SOURCES_CONFIG_FILE``
    :param source_id: identifier used in ``--sources_config`` to describe pipeline
    :param subitem: one or more items under the parent ``source_id`` to specify which subitems should be run
    :param entiteit: one or more entity arguments to specify which entities should be run
    """

    sources = load_sources_config(sources_config)

    # Find the requested source definition in the list of available sources
    source = sources.get(source_id)

    # Without a config we can't do anything, notify the user and exit
    if not source:
        click.echo('Error: unable to find source with id "%s" in sources '
                   'config' % source_id)
        return

    # Check for old-style json sources
    if 'id' in source:
        setup_pipeline(source)
        return

    # New-style behaviour
    if 'entities' in source:
        # Only one main source
        selected_sources = {source_id: source}
    elif subitem:
        # Validate the requested subitems up front, so that a typo produces
        # a readable error before anything has been queued.
        unknown = [name for name in subitem if name not in source]
        if unknown:
            click.echo('Error: unable to find subitem(s) %s in source "%s"'
                       % (', '.join('"%s"' % name for name in unknown),
                          source_id))
            return
        selected_sources = {name: source[name] for name in subitem}
    else:
        # All sub sources if no subs are specified
        selected_sources = source

    # Processing each item
    for selected in selected_sources.values():
        # A (sub)source without ``entities`` has nothing to run; skip it
        # rather than fail while iterating over ``None``.
        for item in selected.get('entities') or ():
            if not item:
                continue
            if entiteit and item.get('entity') not in entiteit:
                continue

            # Each entity runs with the source settings overlaid by its own.
            new_source = deepcopy(selected)
            new_source.update(item)
            setup_pipeline(new_source)
Esempio n. 3
0
def load_dump(collection_dump, collection_name):
    """
    Restore an index from a dump file.

    When no dump is given, the gzipped dumps found one directory level below
    ``LOCAL_DUMPS_DIR`` are listed and the user is prompted to pick one. A
    source definition is then built for the dump and handed to
    ``setup_pipeline``.

    :param collection_dump: Path to a local gzipped dump to load. Optional; when empty the user is prompted to choose one of the available dumps.
    :param collection_name: Name for the local index to restore the dump to. Optional; will be derived from the dump name, at your own risk. Note that the pipeline will add a "ocd_" prefix string to the collection name, to ensure the proper mapping and settings are applied.
    """
    if not collection_dump:
        available_dumps = glob(os.path.join(LOCAL_DUMPS_DIR, '*/*.gz'))
        if not available_dumps:
            # Prompting with an empty list of choices can never be answered,
            # so bail out with a clear message instead.
            click.secho('Error: no dumps found in {}'.format(LOCAL_DUMPS_DIR),
                        fg='red')
            return

        choices = []
        for number, dump in enumerate(available_dumps, 1):
            # ``str`` exists on both Python 2 and 3 (``unicode`` does not).
            choices.append(str(number))
            click.secho('{i}) {dump}'.format(i=number, dump=dump), fg='green')
        dump_idx = click.prompt('Choose one of the dumps listed above',
                                type=click.Choice(choices))
        collection_dump = available_dumps[int(dump_idx) - 1]

    collection = os.path.abspath(collection_dump)

    # The id is made of the first two underscore-separated parts of the file
    # name, ignoring everything from the first dot onwards.
    file_stem = os.path.basename(collection).split('.')[0]
    collection_id = '_'.join(file_stem.split('_')[:2])

    if not collection_name:
        prefix = 'ocd_'
        collection_name = collection_id
        # Strip the prefix only when it really is a prefix; a plain
        # ``replace`` would also remove the text from the middle of an id.
        if collection_name.startswith(prefix):
            collection_name = collection_name[len(prefix):]

    source_definition = {
        'id': collection_id,
        'extractor': 'ocd_backend.extractors.staticfile.StaticJSONDumpExtractor',
        'transformer': 'ocd_backend.transformers.BaseTransformer',
        'loader': 'ocd_backend.loaders.ElasticsearchLoader',
        "cleanup": "ocd_backend.tasks.CleanupElasticsearch",
        'item': 'ocd_backend.items.LocalDumpItem',
        "enrichers": [
            [
                "ocd_backend.enrichers.media_enricher.MediaEnricher",
                {
                    "tasks": ["media_type", "image_metadata"]
                }
            ]
        ],
        'dump_path': collection,
        'index_name': collection_name
    }

    # Show the definition that is about to be queued.
    click.secho(str(source_definition), fg='yellow')

    setup_pipeline(source_definition)

    click.secho('Queued items from {}. Please make sure your Celery workers'
                ' are running, so the loaded items are processed.'.format(collection),
                fg='green')
Esempio n. 4
0
def extract_start(source_id, sources_config):
    """Kick off the extraction pipeline of the source with id ``source_id``.

    ``SOURCES_CONFIG_FILE`` is used when no ``sources_config`` is given. If
    the loaded configuration holds no source with the requested id, an error
    is printed and nothing is started.
    """
    sources = load_sources_config(sources_config or SOURCES_CONFIG_FILE)

    # Gather every definition carrying the requested id; should an id occur
    # more than once, the last definition is the one that gets used.
    matches = [entry for entry in sources if entry['id'] == source_id]
    source = matches[-1] if matches else None

    if source:
        setup_pipeline(source)
        return

    # No usable definition: tell the user and stop.
    click.echo(
        'Error: unable to find source with id "%s" in sources config'
        % source_id)
Esempio n. 5
0
def extract_start(source_id, sources_config):
    """Start the pipeline for the source whose ``id`` equals ``source_id``.

    The sources are read from ``sources_config``, or from
    ``SOURCES_CONFIG_FILE`` when that argument is empty. An error is echoed
    and the function returns early when the id cannot be found.
    """
    config_path = sources_config if sources_config else SOURCES_CONFIG_FILE

    # Walk the complete list of definitions; a later definition with the
    # same id overrides an earlier one.
    found = None
    for definition in load_sources_config(config_path):
        if definition['id'] == source_id:
            found = definition

    # Nothing to run without a definition, so report it and bail out.
    if not found:
        message = 'Error: unable to find source with id "%s" in sources config'
        click.echo(message % source_id)
        return

    setup_pipeline(found)
Esempio n. 6
0
def extract_start(source_id, sources_config):
    """
    Look up the pipeline definition with id ``source_id`` in the sources
    loaded from ``sources_config`` and set up its pipeline.

    When no definition carries that id, an error message is echoed and no
    pipeline is set up.

    :param source_id: value of the ``id`` key of the pipeline definition to run
    :param sources_config: passed to ``load_sources_config`` to obtain the pipeline definitions
    """
    # Keep every definition whose id matches; with duplicate ids the last
    # one in the configuration is used.
    candidates = [
        definition
        for definition in load_sources_config(sources_config)
        if definition['id'] == source_id
    ]
    source = candidates[-1] if candidates else None

    if source:
        setup_pipeline(source)
    else:
        # Without a definition there is nothing to start; inform the user.
        click.echo(
            'Error: unable to find source with id "%s" in sources config'
            % source_id)
Esempio n. 7
0
def extract_start(source_id, sources_config):
    """
    Set up the extraction pipeline for the definition identified by
    ``source_id``.

    The definitions are obtained through ``load_sources_config``. If none of
    them has the requested id, an error is echoed and the function returns
    without setting up a pipeline.

    :param source_id: value of the ``id`` key of the pipeline definition to run
    :param sources_config: passed to ``load_sources_config`` to obtain the pipeline definitions
    """
    selected = None

    # Scan all definitions; when several share the requested id, the one
    # listed last ends up being selected.
    for definition in load_sources_config(sources_config):
        if definition['id'] == source_id:
            selected = definition

    if selected:
        setup_pipeline(selected)
        return

    # No matching definition was found: notify the user and exit.
    not_found = 'Error: unable to find source with id "%s" in sources config'
    click.echo(not_found % source_id)