def qa_matrix(index, sources_config):
    """
    Print a QA matrix: per source, the total document count and the count of
    documents that have each combined-index field present.

    :param index: name of the Elasticsearch index to inspect
    :param sources_config: Path to file containing pipeline definitions
    """
    fields = BaseItem.combined_index_fields
    sources = load_sources_config(sources_config)

    # Total number of documents per source
    all_body = {
        'query': {
            'constant_score': {
                'query': {'match_all': {}}
            }
        },
        'aggs': {
            'source_id': {
                'terms': {'field': 'meta.source_id', 'size': len(sources)},
            }
        },
        'size': 0
    }
    all_result = es.search(index=index, body=all_body)
    all_counts = {b['key']: {'all': b['doc_count']}
                  for b in all_result['aggregations']['source_id']['buckets']}

    # Per field: number of documents per source that have the field present
    for field in fields:
        body = {
            'query': {
                'filtered': {
                    'query': {
                        'constant_score': {
                            'query': {'match_all': {}}
                        }
                    },
                    'filter': {
                        'exists': {'field': field}
                    }
                }
            },
            'aggs': {
                'source_id': {
                    'terms': {'field': 'meta.source_id', 'size': len(sources)},
                }
            },
            'size': 0
        }
        result = es.search(index=index, body=body)
        for b in result['aggregations']['source_id']['buckets']:
            all_counts[b['key']][field] = b['doc_count']

    pprint(all_counts)

    # BUG FIX: the header previously sorted 'Source' and 'all' together with
    # the field names while the data rows printed source/all first, so header
    # columns did not line up with the data columns. Keep both in one order.
    sorted_fields = sorted(fields)
    header = ['Source', 'all'] + sorted_fields
    print ("{:<24}" * len(header)).format(*header)
    for source_id, counts in all_counts.iteritems():
        print ("{:<24}" * len(header)).format(
            *([source_id, counts['all']] +
              [counts.get(c, 0) for c in sorted_fields]))
def extract_list_sources(sources_config):
    """Show a list of available sources."""
    # Fall back to the default configuration file when none was given
    config_path = sources_config or SOURCES_CONFIG_FILE
    click.echo('Available sources:')
    for src in load_sources_config(config_path):
        click.echo(' - %s' % src['id'])
def extract_start(source_id, subitem, entiteit, sources_config):
    """
    Start extraction for a pipeline specified by ``source_id`` defined in
    ``--sources-config``. ``--sources-config`` defaults to
    ``settings.SOURCES_CONFIG_FILE``.

    When ``id`` is specified in the source it will trigger old-style json
    behaviour for backward compatibility reasons. Otherwise new-style yaml
    is assumed, which looks for ``entities`` in the source to determine the
    order in which entities are processed. If no ``entities`` are found in
    the source, all subitems of the source will be processed, if any. If one
    or more ``--subitem`` is specified, only those subitems will be
    processed. When one or more ``--entiteit`` is specified, only those
    entities will be processed. By default, all entities are processed.

    Note: ``--subitem`` and ``--entiteit`` only work in new-style yaml
    configurations.

    :param sources_config: Path to file containing pipeline definitions.
        Defaults to the value of ``settings.SOURCES_CONFIG_FILE``
    :param source_id: identifier used in ``--sources_config`` to describe
        pipeline
    :param subitem: one or more items under the parent ``source_id`` to
        specify which subitems should be run
    :param entiteit: one or more entity arguments to specify which entities
        should be run
    """
    sources = load_sources_config(sources_config)

    # Find the requested source definition in the list of available sources
    source = sources.get(source_id)

    # Without a config we can't do anything, notify the user and exit
    if not source:
        click.echo('Error: unable to find source with id "%s" in sources '
                   'config' % source_id)
        return

    # Check for old-style json sources
    if 'id' in source:
        setup_pipeline(source)
        return

    # New-style behaviour
    selected_sources = dict()
    if 'entities' not in source:
        if subitem:
            # Add the specified subs
            for s in subitem:
                selected_sources[s] = source[s]
        else:
            # All sub sources if no subs are specified
            selected_sources = source
    else:
        # Only one main source
        selected_sources = {source_id: source}

    # Processing each item
    for source_id, source in selected_sources.items():
        # BUG FIX: default to an empty list so a (sub)source without an
        # 'entities' key does not raise "TypeError: 'NoneType' object is
        # not iterable"
        for item in source.get('entities', []):
            if (not entiteit and item) or \
                    (entiteit and item.get('entity') in entiteit):
                new_source = deepcopy(source)
                new_source.update(item)
                setup_pipeline(new_source)
def extract_list_sources(sources_config):
    """
    Show a list of available sources (preconfigured pipelines).

    :param sources_config: Path to file containing pipeline definitions.
        Defaults to the value of ``settings.SOURCES_CONFIG_FILE``
    """
    click.echo('Available sources:')
    for entry in load_sources_config(sources_config):
        click.echo(' - %s' % entry['id'])
def extract_check_sources(sources_config):
    """
    Check http fetch results from a list of available sources
    (preconfigured pipelines).

    :param sources_config: Path to file containing pipeline definitions.
        Defaults to the value of ``settings.SOURCES_CONFIG_FILE``
    """
    sources = load_sources_config(sources_config)
    for source in sources:
        # Only sources with an http(s) file_url can be fetched
        if 'file_url' not in source:
            continue
        if not re.match('https?://', source['file_url']):
            continue
        try:
            resp = requests.get(source['file_url'], timeout=10)
        except requests.RequestException as exc:
            # A single unreachable source should not abort the whole check
            click.echo('%s - %s' % (exc, source['file_url']))
            continue
        # BUG FIX: the requests Response attribute is status_code, not
        # statuscode (which raises AttributeError)
        click.echo('%s - %s' % (
            resp.status_code,
            source['file_url'],
        ))
def extract_start(source_id, sources_config):
    """Start extraction for a specified source."""
    # Fall back to the default configuration file when none was given
    config_path = sources_config or SOURCES_CONFIG_FILE
    sources = load_sources_config(config_path)

    # Scan the available definitions for one whose id matches source_id
    source = None
    for candidate in sources:
        if candidate['id'] == source_id:
            source = candidate

    # Without a config we can't do anything, notify the user and exit
    if source is None:
        click.echo('Error: unable to find source with id "%s" in sources '
                   'config' % source_id)
        return

    setup_pipeline(source)
def extract_list_sources(sources_config):
    """
    Show a list of available sources (preconfigured pipelines). Old-style
    sources might show multiple entities. New-style sources will show only
    the name of the source

    :param sources_config: Path to file containing pipeline definitions.
        Defaults to the value of ``settings.SOURCES_CONFIG_FILE``
    """
    names = set()
    for name, definition in load_sources_config(sources_config).items():
        names.add(name)
        # New-style sources without 'id'/'entities' expose sub-sources,
        # shown in the "-s" invocation form
        if 'id' not in definition and 'entities' not in definition:
            names.update('%s -s %s' % (name, sub) for sub in definition)

    click.echo('Available sources:')
    for entry in sorted(names):
        click.echo(' - %s' % entry)
def extract_start(source_id, sources_config):
    """
    Start extraction for a pipeline specified by ``source_id`` defined in
    ``--sources-config``. ``--sources-config`` defaults to
    ``settings.SOURCES_CONFIG_FILE``.

    :param sources_config: Path to file containing pipeline definitions.
        Defaults to the value of ``settings.SOURCES_CONFIG_FILE``
    :param source_id: identifier used in ``--sources_config`` to describe
        pipeline
    """
    sources = load_sources_config(sources_config)

    # Scan the available definitions for one whose id matches source_id
    source = None
    for candidate in sources:
        if candidate['id'] == source_id:
            source = candidate

    # Without a config we can't do anything, notify the user and exit
    if source is None:
        click.echo('Error: unable to find source with id "%s" in sources '
                   'config' % source_id)
        return

    setup_pipeline(source)
def test_load_all_sources():
    """Yield a _load_source check for every configured source (nose-style)."""
    for src in load_sources_config('ocd_backend/sources.json'):
        yield _load_source, src
def test_load_sources_config_open_file():
    """load_sources_config should accept an open file object."""
    with open('ocd_backend/sources.json') as fh:
        loaded = load_sources_config(fh)
        assert isinstance(loaded, list)
def test_load_sources_config_filename():
    """load_sources_config should accept a path string."""
    loaded = load_sources_config('ocd_backend/sources.json')
    assert isinstance(loaded, list)
def qa_matrix(index, sources_config):
    """
    Print a QA matrix: per source, the total document count and the count of
    documents that have each combined-index field present.

    :param index: name of the Elasticsearch index to inspect
    :param sources_config: Path to file containing pipeline definitions
    """
    fields = BaseItem.combined_index_fields
    sources = load_sources_config(sources_config)

    # Total number of documents per source
    all_body = {
        'query': {
            'constant_score': {
                'query': {
                    'match_all': {}
                }
            }
        },
        'aggs': {
            'source_id': {
                'terms': {
                    'field': 'meta.source_id',
                    'size': len(sources)
                },
            }
        },
        'size': 0
    }
    all_result = es.search(index=index, body=all_body)
    all_counts = {
        b['key']: {
            'all': b['doc_count']
        } for b in all_result['aggregations']['source_id']['buckets']
    }

    # Per field: number of documents per source that have the field present
    for field in fields:
        body = {
            'query': {
                'filtered': {
                    'query': {
                        'constant_score': {
                            'query': {
                                'match_all': {}
                            }
                        }
                    },
                    'filter': {
                        'exists': {
                            'field': field
                        }
                    }
                }
            },
            'aggs': {
                'source_id': {
                    'terms': {
                        'field': 'meta.source_id',
                        'size': len(sources)
                    },
                }
            },
            'size': 0
        }
        result = es.search(index=index, body=body)
        for b in result['aggregations']['source_id']['buckets']:
            all_counts[b['key']][field] = b['doc_count']

    pprint(all_counts)

    # BUG FIX: print(...).format(...) called .format() on print()'s return
    # value (None), raising AttributeError — format the row first, then
    # print it. Also: list + dict.keys() raises TypeError and iteritems()
    # does not exist on Python 3 (use .items()), and the header previously
    # mixed 'Source'/'all' into the sorted field names while rows printed
    # source/all first, misaligning the columns.
    sorted_fields = sorted(fields)
    row_format = "{:<24}" * (len(sorted_fields) + 2)
    print(row_format.format(*(['Source', 'all'] + sorted_fields)))
    for source_id, counts in all_counts.items():
        print(row_format.format(
            *([source_id, counts['all']] +
              [counts.get(c, 0) for c in sorted_fields])))
def __init__(self, *args, **kwargs):
    """Initialize the base extractor, load the sources configuration named in
    the source definition, and build one sub-extractor per configured source."""
    super(DataSyncBaseExtractor, self).__init__(*args, **kwargs)
    self.sources = load_sources_config(
        self.source_definition['sources_config'])
    self.extractors = []
    for sub_source in self.source_definition['sources']:
        self.extractors.append(self._init_extractor_from_source(sub_source))
def extract_process(modus, source_path, sources_config):
    """
    Start extraction based on the flags in Redis. It uses the source_path in
    Redis db 1 to identify which municipalities should be extracted. A
    municipality can be set using 'SET ori.ibabs.arnhem enabled'. Currently,
    possible values are: enabled, disabled and archived.

    :param modus: the configuration to use for processing, starting with an
        underscore. i.e. _enabled, _archived, _disabled. Looks for
        configuration in redis like _custom.start_date
    :param source_path: path in redis to search, i.e. ori.ibabs.arnhem.
        Defaults to *
    :param sources_config: Path to file containing pipeline definitions.
        Defaults to the value of ``settings.SOURCES_CONFIG_FILE``
    """
    redis_client = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT, db=1)
    available_sources = load_sources_config(sources_config)
    redis_sources = redis_client.keys(source_path)

    sources = []
    for redis_source in redis_sources:
        if redis_source[0:1] == '_':
            # Settings are underscored so we skip them
            continue
        source_value = redis_client.get(redis_source)
        if source_value.startswith('disabled'):
            # If value equals disabled we will not process the source
            continue
        elif modus in source_value:
            sources.append(redis_source)

    if not redis_sources:
        click.echo('No sources found in redis')
        return
    elif not sources:
        # BUG FIX: message typo — "non match" should read "none match"
        click.echo('Redis sources found but none match the modus %s' % modus)
        return

    settings_path = '_%s.*' % modus
    setting_keys = redis_client.keys(settings_path)
    if not setting_keys:
        click.echo('No settings found in redis for %s' % settings_path)
        return

    # Collect modus-level settings; the special 'entities' key restricts
    # which entities will be processed
    settings = {}
    enabled_entities = []
    for key in setting_keys:
        _, _, name = key.rpartition('.')
        value = redis_client.get(key)
        if name == 'entities':
            enabled_entities = value.split(' ')
        else:
            settings[name] = value

    for source in sources:
        try:
            project, provider, source_name = source.split('.')
            available_source = available_sources[
                '%s.%s' % (project, provider)][source_name]
            click.echo('[%s] Start extract for %s' % (source_name,
                                                      source_name))

            selected_entities = []
            for entity in available_source['entities']:
                if not enabled_entities or entity.get(
                        'entity') in enabled_entities:
                    selected_entities.append(entity.get('entity'))
                    # Redis settings are overruled by source definitions, for
                    # some sources a start_date must be enforced
                    new_source = deepcopy(settings)
                    new_source.update(deepcopy(available_source))
                    new_source.update(entity)
                    setup_pipeline.delay(new_source)

            click.echo('[%s] Started pipelines: %s' % (
                source_name, ', '.join(selected_entities)))
        except ValueError:
            click.echo('Invalid source format %s in redis' % source)
        except KeyError:
            click.echo(
                'Source %s in redis does not exist in available sources' %
                source)