コード例 #1
0
ファイル: manage.py プロジェクト: openstate/open-cultuur-data
def qa_matrix(index, sources_config):
    """Print how many documents of each source have a value for each field.

    One terms aggregation on ``meta.source_id`` is run over all documents,
    followed by one per field of ``BaseItem.combined_index_fields`` limited
    to documents in which that field exists. The collected counts are
    pretty-printed and then shown as a fixed-width table with one row per
    source and one column per field.

    :param index: name of the index handed to ``es.search``
    :param sources_config: handed to ``load_sources_config``; the number of
        loaded sources is used as the bucket limit of every aggregation
    """
    fields = BaseItem.combined_index_fields
    sources = load_sources_config(sources_config)

    def _count_per_source(query):
        # Run ``query`` and return the ``meta.source_id`` buckets only.
        body = {
            'query': query,
            'aggs': {
                'source_id': {
                    'terms': {'field': 'meta.source_id', 'size': len(sources)},
                }
            },
            'size': 0
        }
        result = es.search(index=index, body=body)
        return result['aggregations']['source_id']['buckets']

    def _match_all():
        # Fresh dict per request so no query structure is shared.
        return {'constant_score': {'query': {'match_all': {}}}}

    all_counts = {
        b['key']: {'all': b['doc_count']}
        for b in _count_per_source(_match_all())
    }

    for field in fields:
        query = {
            'filtered': {
                'query': _match_all(),
                'filter': {
                    'exists': {'field': field}
                }
            }
        }
        for b in _count_per_source(query):
            all_counts[b['key']][field] = b['doc_count']

    pprint(all_counts)

    # The header must use the same column order as the rows: source id,
    # total count, then the fields in sorted order. Sorting the labels
    # 'Source' and 'all' together with the field names (as was done before)
    # only lines up by coincidence.
    columns = sorted(fields)
    row_format = "{:<24}" * (len(columns) + 2)

    # A parenthesised single argument prints the same on Python 2 and 3.
    print(row_format.format('Source', 'all', *columns))
    # Sort the rows so the output is stable between runs.
    for source_id, counts in sorted(all_counts.items()):
        values = [counts.get(c, 0) for c in columns]
        print(row_format.format(source_id, counts['all'], *values))
コード例 #2
0
ファイル: manage.py プロジェクト: ajslaghu/open-cultuur-data
def extract_list_sources(sources_config):
    """Echo the ``id`` of every source found in the sources config.

    When ``sources_config`` is empty, ``SOURCES_CONFIG_FILE`` is used.
    """
    available = load_sources_config(sources_config or SOURCES_CONFIG_FILE)

    click.echo('Available sources:')
    for entry in available:
        click.echo(' - %s' % entry['id'])
コード例 #3
0
def extract_list_sources(sources_config):
    """List the ids of all configured sources, one per line.

    ``SOURCES_CONFIG_FILE`` is loaded when no ``sources_config`` is given.
    """
    config_path = sources_config if sources_config else SOURCES_CONFIG_FILE
    configured = load_sources_config(config_path)

    click.echo('Available sources:')
    for definition in configured:
        click.echo(' - %s' % definition['id'])
コード例 #4
0
ファイル: manage.py プロジェクト: SLKTH/open-raadsinformatie
def extract_start(source_id, subitem, entiteit, sources_config):
    """
    Start extraction for a pipeline specified by ``source_id`` defined in
    ``--sources-config``.

    When ``id`` is specified in the source it will trigger old-style json behaviour for backward compatibility reasons.

    Otherwise new-style yaml is assumed, which looks for ``entities`` in the source to determine the order in which entities are processed.
    If no ``entities`` are found in the source, all subitems of the source will be processed, if any.
    If one or more ``--subitem`` is specified, only those subitems will be processed.
    When one or more ``--entiteit`` is specified, only those entities will be processed. By default, all entities are processed.

    Note: ``--subitem`` and ``--entiteit`` only work in new-style yaml configurations.

    An unknown ``source_id`` or ``--subitem`` is reported with an error message and nothing is started.
    Sources or subitems that have no ``entities`` are skipped.

    :param sources_config: Path to file containing pipeline definitions.
    :param source_id: identifier used in ``--sources_config`` to describe pipeline
    :param subitem: one or more items under the parent ``source_id`` to specify which subitems should be run
    :param entiteit: one or more entity arguments to specify which entities should be run
    """

    sources = load_sources_config(sources_config)

    # Find the requested source definition in the list of available sources
    source = sources.get(source_id)

    # Without a config we can't do anything, notify the user and exit
    if not source:
        click.echo('Error: unable to find source with id "%s" in sources '
                   'config' % source_id)
        return

    # Check for old-style json sources
    if 'id' in source:
        setup_pipeline(source)
        return

    # New-style behaviour
    if 'entities' in source:
        # Only one main source
        selected_sources = {source_id: source}
    elif subitem:
        # Validate all requested subitems up front so that a typo is
        # reported clearly instead of surfacing as a bare KeyError.
        missing = [s for s in subitem if s not in source]
        if missing:
            click.echo('Error: unable to find subitem "%s" in source "%s"'
                       % (', '.join(missing), source_id))
            return
        selected_sources = dict((s, source[s]) for s in subitem)
    else:
        # All sub sources if no subs are specified
        selected_sources = source

    # Processing each item
    for selected in selected_sources.values():
        # A sub source without ``entities`` has nothing to run; ``or []``
        # avoids iterating over None.
        for item in selected.get('entities') or []:
            if not item:
                continue
            if entiteit and item.get('entity') not in entiteit:
                continue

            # Entity settings are layered on top of a copy of the source
            # so the loaded configuration itself is never modified.
            new_source = deepcopy(selected)
            new_source.update(item)
            setup_pipeline(new_source)
コード例 #5
0
ファイル: manage.py プロジェクト: ajslaghu/open-wob-api
def extract_list_sources(sources_config):
    """
    Echo the ``id`` of every available source (preconfigured pipeline).

    :param sources_config: value handed to ``load_sources_config`` to obtain the pipeline definitions
    """
    line_template = ' - %s'
    available = load_sources_config(sources_config)

    click.echo('Available sources:')
    for entry in available:
        click.echo(line_template % entry['id'])
コード例 #6
0
def extract_list_sources(sources_config):
    """
    Print a bullet list with the ``id`` of each configured source (preconfigured pipeline).

    :param sources_config: value handed to ``load_sources_config`` to obtain the pipeline definitions
    """
    definitions = load_sources_config(sources_config)

    click.echo('Available sources:')
    for definition in definitions:
        source_id = definition['id']
        click.echo(' - %s' % source_id)
コード例 #7
0
def extract_check_sources(sources_config):
    """
    Check http fetch results from a list of available sources (preconfigured pipelines).

    Every source whose ``file_url`` starts with ``http://`` or ``https://`` is
    fetched with a 10 second timeout and its HTTP status code is echoed next to
    the URL. Sources without such a URL are skipped. A request that fails
    (timeout, connection error, ...) is reported and the check continues with
    the next source.

    :param sources_config: Path to file containing pipeline definitions.
    """
    sources = load_sources_config(sources_config)

    for source in sources:
        if 'file_url' not in source:
            continue

        file_url = source['file_url']
        if not re.match('https?://', file_url):
            continue

        try:
            resp = requests.get(file_url, timeout=10)
        except requests.RequestException as error:
            # One unreachable source must not abort the whole check.
            click.echo('ERROR - %s (%s)' % (file_url, error))
            continue

        # The attribute is ``status_code``; ``statuscode`` does not exist
        # on a requests Response and raised AttributeError.
        click.echo('%s - %s' % (
            resp.status_code,
            file_url,
        ))
コード例 #8
0
ファイル: manage.py プロジェクト: ajslaghu/open-cultuur-data
def extract_start(source_id, sources_config):
    """Set up the pipeline for the source whose ``id`` equals ``source_id``.

    ``SOURCES_CONFIG_FILE`` is used when ``sources_config`` is empty. If no
    source matches, an error is echoed and nothing is started.
    """
    config_path = sources_config or SOURCES_CONFIG_FILE

    # Every candidate is inspected; when several share the id, the last
    # one in the config is used.
    matches = [candidate for candidate in load_sources_config(config_path)
               if candidate['id'] == source_id]

    # Without a matching definition there is nothing to run
    if not matches:
        click.echo('Error: unable to find source with id "%s" in sources '
                   'config' % source_id)
        return

    setup_pipeline(matches[-1])
コード例 #9
0
def extract_start(source_id, sources_config):
    """Look up ``source_id`` in the sources config and start its pipeline.

    Falls back to ``SOURCES_CONFIG_FILE`` when no config is passed. An
    unknown id only produces an error message.
    """
    if not sources_config:
        sources_config = SOURCES_CONFIG_FILE

    # Scan the complete list; a later definition with the same id
    # replaces an earlier one.
    selected = None
    for definition in load_sources_config(sources_config):
        if definition['id'] == source_id:
            selected = definition

    if selected:
        setup_pipeline(selected)
        return

    # Nothing usable was found, tell the user
    click.echo('Error: unable to find source with id "%s" in sources '
               'config' % source_id)
コード例 #10
0
ファイル: manage.py プロジェクト: SLKTH/open-raadsinformatie
def extract_list_sources(sources_config):
    """
    Echo a sorted, de-duplicated list of the available sources.

    Every top-level key of the sources config is listed. For a source that
    has neither an ``id`` nor an ``entities`` key, each of its own keys is
    listed as well, in the form ``<source> -s <subitem>``.

    :param sources_config: value handed to ``load_sources_config`` to obtain the pipeline definitions
    """
    listing = set()
    for name, definition in load_sources_config(sources_config).items():
        listing.add(name)
        if not ('id' in definition or 'entities' in definition):
            listing.update(
                '%s -s %s' % (name, sub_name)
                for sub_name in definition.keys())

    click.echo('Available sources:')
    for entry in sorted(listing):
        click.echo(' - %s' % entry)
コード例 #11
0
def extract_start(source_id, sources_config):
    """
    Start extraction for the pipeline whose ``id`` equals ``source_id``.

    If no source in the loaded config has that id, an error is echoed and
    nothing is started.

    :param sources_config: value handed to ``load_sources_config`` to obtain the pipeline definitions
    :param source_id: identifier used in the sources config to describe the pipeline
    """
    # All definitions are compared; with duplicate ids the last one wins.
    matching = [definition
                for definition in load_sources_config(sources_config)
                if definition['id'] == source_id]

    # No definition means there is nothing to run
    if not matching:
        click.echo('Error: unable to find source with id "%s" in sources '
                   'config' % source_id)
        return

    setup_pipeline(matching[-1])
コード例 #12
0
ファイル: manage.py プロジェクト: ajslaghu/open-wob-api
def extract_start(source_id, sources_config):
    """
    Find the source with id ``source_id`` in the sources config and set up
    its pipeline.

    An unknown id results in an error message; no pipeline is started then.

    :param sources_config: value handed to ``load_sources_config`` to obtain the pipeline definitions
    :param source_id: identifier used in the sources config to describe the pipeline
    """
    # Walk the whole list so that, for duplicate ids, the final
    # definition is the one that is kept.
    chosen = None
    for definition in load_sources_config(sources_config):
        if definition['id'] == source_id:
            chosen = definition

    if chosen:
        setup_pipeline(chosen)
    else:
        # Report the unknown id instead of starting anything
        click.echo('Error: unable to find source with id "%s" in sources '
                   'config' % source_id)
コード例 #13
0
def test_load_all_sources():
    # Generator test: yields one (_load_source, source) check per entry
    # of the bundled sources file.
    for definition in load_sources_config('ocd_backend/sources.json'):
        yield (_load_source, definition)
コード例 #14
0
def test_load_sources_config_open_file():
    # An already opened file object must be accepted and yield a list.
    with open('ocd_backend/sources.json') as handle:
        loaded = load_sources_config(handle)
    assert isinstance(loaded, list)
コード例 #15
0
def test_load_sources_config_filename():
    # Loading by file name must yield a list.
    assert isinstance(load_sources_config('ocd_backend/sources.json'), list)
コード例 #16
0
def test_load_all_sources():
    # Emit a separate (_load_source, source) check for every source in
    # the bundled sources file.
    loaded = load_sources_config('ocd_backend/sources.json')
    for entry in loaded:
        yield (_load_source, entry)
コード例 #17
0
def test_load_sources_config_open_file():
    # Passing an open file instead of a path must still return a list.
    with open('ocd_backend/sources.json') as config_file:
        result = load_sources_config(config_file)
    assert isinstance(result, list)
コード例 #18
0
def test_load_sources_config_filename():
    # Passing a path must return a list.
    result = load_sources_config('ocd_backend/sources.json')
    assert isinstance(result, list)
コード例 #19
0
def qa_matrix(index, sources_config):
    """Print a field coverage table: documents per source that have a field.

    A terms aggregation on ``meta.source_id`` is first run over all
    documents, then once per field of ``BaseItem.combined_index_fields``
    restricted to documents in which the field exists. The counts are
    pretty-printed and then laid out as a fixed-width table with one row
    per source and one column per field.

    :param index: name of the index handed to ``es.search``
    :param sources_config: handed to ``load_sources_config``; the number of
        loaded sources is used as the bucket limit of every aggregation
    """
    fields = BaseItem.combined_index_fields
    sources = load_sources_config(sources_config)

    def _match_all():
        # Fresh dict per request so no query structure is shared.
        return {
            'constant_score': {
                'query': {
                    'match_all': {}
                }
            }
        }

    def _buckets_for(query):
        # Run ``query`` and return the ``meta.source_id`` buckets only.
        body = {
            'query': query,
            'aggs': {
                'source_id': {
                    'terms': {
                        'field': 'meta.source_id',
                        'size': len(sources)
                    },
                }
            },
            'size': 0
        }
        result = es.search(index=index, body=body)
        return result['aggregations']['source_id']['buckets']

    all_counts = {
        b['key']: {
            'all': b['doc_count']
        }
        for b in _buckets_for(_match_all())
    }

    for field in fields:
        query = {
            'filtered': {
                'query': _match_all(),
                'filter': {
                    'exists': {
                        'field': field
                    }
                }
            }
        }
        for b in _buckets_for(query):
            all_counts[b['key']][field] = b['doc_count']

    pprint(all_counts)

    # Header and rows share one column order: source id, total count, then
    # the fields sorted by name. Previously 'Source' and 'all' were sorted
    # in with the field names, which could shift the header relative to
    # the values below it.
    columns = sorted(fields)
    row_format = "{:<24}" * (len(columns) + 2)

    # Formatting inside the call keeps this working on Python 2 and 3.
    print(row_format.format('Source', 'all', *columns))
    # Sort the rows so the output is stable between runs.
    for source_id, counts in sorted(all_counts.items()):
        values = [counts.get(c, 0) for c in columns]
        print(row_format.format(source_id, counts['all'], *values))
コード例 #20
0
    def __init__(self, *args, **kwargs):
        """Load the sources config and create one extractor per listed source.

        Both the config location (``sources_config``) and the list of
        sources (``sources``) are read from ``self.source_definition``.
        """
        super(DataSyncBaseExtractor, self).__init__(*args, **kwargs)

        config_location = self.source_definition['sources_config']
        self.sources = load_sources_config(config_location)

        # self.sources is assigned before any extractor is initialised.
        built = []
        for source in self.source_definition['sources']:
            built.append(self._init_extractor_from_source(source))
        self.extractors = built
コード例 #21
0
def extract_process(modus, source_path, sources_config):
    """
    Queue extraction pipelines for the sources flagged in Redis db 1.

    Keys matching ``source_path`` are read from Redis. Keys starting with an
    underscore hold settings and are ignored as sources; a source is selected
    when its value does not start with ``disabled`` and contains ``modus``.
    Settings are read from the keys matching ``_<modus>.*``: the ``entities``
    setting (space separated) limits which entities are started, every other
    setting is copied into each pipeline definition as a default.

    :param modus: value to look for in the Redis source values; also used to build the settings key pattern ``_<modus>.*``
    :param source_path: key pattern to search for in Redis, e.g. ori.ibabs.arnhem
    :param sources_config: value handed to ``load_sources_config`` to obtain the pipeline definitions
    """
    redis_client = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT, db=1)

    available_sources = load_sources_config(sources_config)
    redis_sources = redis_client.keys(source_path)

    if not redis_sources:
        click.echo('No sources found in redis')
        return

    selected_keys = []
    for redis_key in redis_sources:
        # Underscored keys are settings, not sources
        if redis_key[0:1] != '_':
            flag = redis_client.get(redis_key)
            # Disabled sources are never processed
            if not flag.startswith('disabled') and modus in flag:
                selected_keys.append(redis_key)

    if not selected_keys:
        click.echo('Redis sources found but non match the modus %s' % modus)
        return

    settings_path = '_%s.*' % modus
    setting_keys = redis_client.keys(settings_path)
    if not setting_keys:
        click.echo('No settings found in redis for %s' % settings_path)
        return

    settings = {}
    enabled_entities = []
    for setting_key in setting_keys:
        # The setting name is the part after the last dot
        setting_name = setting_key.rpartition('.')[2]
        setting_value = redis_client.get(setting_key)
        if setting_name == 'entities':
            enabled_entities = setting_value.split(' ')
        else:
            settings[setting_name] = setting_value

    for source in selected_keys:
        try:
            project, provider, source_name = source.split('.')
            provider_sources = available_sources['%s.%s' % (project, provider)]
            available_source = provider_sources[source_name]

            click.echo('[%s] Start extract for %s' %
                       (source_name, source_name))

            selected_entities = []
            for entity in available_source['entities']:
                entity_name = entity.get('entity')
                if enabled_entities and entity_name not in enabled_entities:
                    continue
                selected_entities.append(entity_name)

                # Redis settings come first so that the source definition
                # and then the entity can overrule them
                new_source = deepcopy(settings)
                new_source.update(deepcopy(available_source))
                new_source.update(entity)

                setup_pipeline.delay(new_source)

            click.echo('[%s] Started pipelines: %s' %
                       (source_name, ', '.join(selected_entities)))
        except ValueError:
            click.echo('Invalid source format %s in redis' % source)
        except KeyError:
            click.echo(
                'Source %s in redis does not exist in available sources' %
                source)