def main() -> None:
    utils.setup_logger(__file__)
    logging.info('aggregate_signals.py started')

    signal_collection = SolrCollection(os.getenv('SOLR_COLLECTION_SIGNALS'))
    signal_aggregated_collection = SolrCollection(
        os.getenv('SOLR_COLLECTION_SIGNALS_AGGREGATED'))

    signals = signal_collection.select_all_documents()
    aggregated_signals = signal_aggregated_collection.select_all_documents()

    # aggregate the raw signals per query, per hour of the search timestamp
    # and per filter combination
    signal_aggregated_collection.index_documents(
        get_aggregations(aggregate_fields(signals, ['query', 'handler']),
                         aggregated_signals, 'query'))
    signal_aggregated_collection.index_documents(
        get_aggregations(
            aggregate_fields(
                reduce_date_field_to_hours(signals, 'search_timestamp'),
                ['search_timestamp', 'handler']),
            aggregated_signals, 'search_timestamp'))
    signal_aggregated_collection.index_documents(
        get_aggregations(
            aggregate_fields(preprocess_filters(signals),
                             ['filters', 'handler']),
            aggregated_signals, 'filters'))

    # the raw signals are now aggregated, so the source collection is emptied
    signal_collection.delete_documents('*:*')
    logging.info('aggregate_signals.py finished')

def main() -> None:
    utils.setup_logger(__file__)
    logging.info('update_relations_with_object_property.py -- starting')

    search_collection = SolrCollection(os.getenv('SOLR_COLLECTION_SEARCH'))
    property_to_relation = utils.load_resource('property_to_relation')

    updates = []
    for object_type, mapping in property_to_relation.items():
        object_uri_to_source_mapping = get_object_uri_to_source_mapping(
            search_collection, object_type, mapping['source'])

        for relation in mapping['relations']:
            # select all documents of the related type that actually have the
            # matching field filled
            relation_objects = search_collection.select_all_documents(
                'sys_type:{0} AND {1}:[* TO *]'.format(relation['type'],
                                                       relation['match']),
                ['sys_uri', relation['match']], id_field='sys_id')
            updates += get_relation_updates(relation, mapping,
                                            object_uri_to_source_mapping,
                                            relation_objects)

    search_collection.index_documents(updates)
    logging.info('update_relations_with_object_property.py -- finished')

def main() -> None:
    utils.setup_logger(__file__)

    parser = argparse.ArgumentParser(description='Deletes old donl signals')
    parser.add_argument('--number_of_days', type=int, default=30,
                        help='Specify the number of days after which signals '
                             'are considered old')
    input_arguments = vars(parser.parse_args())

    logging.info('rotate_signals.py started')

    collection = SolrCollection(os.getenv('SOLR_COLLECTION_SIGNALS'))
    old_signals_query = 'search_timestamp:[* TO NOW-{0}DAYS]'.format(
        input_arguments['number_of_days'])

    logging.info('deleting {0} signals that are older than {1} days'.format(
        collection.document_count(old_signals_query),
        input_arguments['number_of_days']))
    collection.delete_documents(old_signals_query)

    logging.info('rotate_signals.py finished')

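# A minimal sketch of the range query rotate_signals.py deletes with; the
# query template and the 30 day default come from main() above, the helper
# name is hypothetical and only used here for illustration.
def _old_signals_query(number_of_days: int = 30) -> str:
    # e.g. 'search_timestamp:[* TO NOW-30DAYS]' for the default, or
    # 'search_timestamp:[* TO NOW-60DAYS]' when run with --number_of_days 60
    return 'search_timestamp:[* TO NOW-{0}DAYS]'.format(number_of_days)
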
def main() -> None:
    utils.setup_logger(__file__)
    logging.info('generate_relations.py -- starting')

    collection = SolrCollection(os.getenv('SOLR_COLLECTION_SEARCH'))

    # every update step is followed by an explicit commit, issued as an empty
    # index_documents call with commit=True
    update_reverse_relations(collection)
    logging.info('committing index changes')
    collection.index_documents([], commit=True)

    update_relations(collection)
    logging.info('committing index changes')
    collection.index_documents([], commit=True)

    update_authority_kind(collection)
    logging.info('committing index changes')
    collection.index_documents([], commit=True)

    update_popularity(collection)
    logging.info('committing index changes')
    collection.index_documents([], commit=True)

    logging.info('generate_relations.py -- finished')

def main() -> None:
    utils.setup_logger(__file__)
    logging.info('list_downloader.py started')

    lists = utils.load_resource('lists')
    update_vocabularies(lists['vocabularies'])
    update_taxonomies(lists['taxonomies'])

    logging.info('list_downloader.py finished')

def main() -> None:
    resources = {
        'stopwords_nl': manage_stopwords_nl,
        'stopwords_en': manage_stopwords_en,
        'labels_nl': manage_label_synonyms_nl,
        'labels_en': manage_label_synonyms_en,
        'uri_synonyms': manage_uri_synonyms,
        'hierarchy_theme': manage_hierarchy_theme
    }

    utils.setup_logger(__file__)
    logging.info('managed_resource.py -- starting')

    parser = argparse.ArgumentParser(description='Maintain the Solr managed '
                                                 'resources.')
    parser.add_argument('--collection', type=str, required=True,
                        choices=solr_collections(),
                        help='Which collection to manage the resource for')
    parser.add_argument('--resource', type=str, choices=resources.keys(),
                        help='Which resource to manage', required=True)
    # a bare --reload flag resolves to True through const; omitting the flag
    # leaves the default of False
    parser.add_argument('--reload', type=bool, nargs='?', default=False,
                        const=True,
                        help='To reload the collection afterwards')
    input_arguments = vars(parser.parse_args())

    collection = SolrCollection(input_arguments['collection'])
    resources.get(input_arguments['resource'])(collection)

    if input_arguments['reload']:
        logging.info('reloading Solr collection')
        collection.reload()

    logging.info('managed_resource.py -- finished')

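# Example command lines for managed_resource.py, assuming it is run as a
# script; the collection name is a placeholder, the --resource choices are the
# keys of the `resources` dict above.
#
#   python managed_resource.py --collection <collection> --resource stopwords_nl
#   python managed_resource.py --collection <collection> --resource uri_synonyms --reload
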
def main() -> None:
    utils.setup_logger(__file__)

    parser = argparse.ArgumentParser(description='Reloads a Solr collection')
    parser.add_argument('--collection', type=str, required=True,
                        choices=solr_collections(),
                        help='Which collection to reload')
    input_arguments = vars(parser.parse_args())

    logging.info('reload_collection.py -- starting')
    logging.info(' > collection: %s', input_arguments['collection'])

    collection = SolrCollection(input_arguments['collection'])
    collection.reload()

    logging.info('reload_collection.py -- finished')

def main() -> None:
    utils.setup_logger(__file__)
    logging.info('generate_suggestions.py -- starting')

    suggest = SolrCollection(os.getenv('SOLR_COLLECTION_SUGGESTER'))
    search = SolrCollection(os.getenv('SOLR_COLLECTION_SEARCH'))

    logging.info('clearing suggestions')
    suggest.delete_documents('*:*', commit=False)

    relation_counts = search.get_facet_counts('relation')

    # map community URIs to their display names
    community_uri_to_name = search.select_all_documents(
        fq='sys_type:community', fl=['sys_uri', 'sys_name'],
        id_field='sys_id')
    community_uri_to_name = {community['sys_uri']: community['sys_name']
                             for community in community_uri_to_name}

    suggestion_types = utils.load_resource('suggestions')

    # title suggestions per document type
    doc_suggestions = {doc_type: get_doc_suggestions(
        search, doc_type, config['mapping'], relation_counts,
        community_uri_to_name)
        for doc_type, config in suggestion_types.items()}

    logging.info('adding title suggestions:')
    for doc_type, doc_type_suggestions in doc_suggestions.items():
        suggest.index_documents(doc_type_suggestions, commit=False)
        logging.info(' titles: %s of type %s',
                     len(doc_type_suggestions), doc_type)

    # suggestions based on user defined synonyms, for the document types that
    # configure them
    user_defined_synonym_suggestions = {doc_type: get_doc_suggestions(
        search, doc_type, config['user_defined_synonyms'], relation_counts,
        community_uri_to_name, 'user_defined_synonyms:[* TO *]')
        for doc_type, config in suggestion_types.items()
        if 'user_defined_synonyms' in config}

    logging.info('adding user defined synonym suggestions:')
    for doc_type, doc_type_suggestions in \
            user_defined_synonym_suggestions.items():
        suggest.index_documents(doc_type_suggestions, commit=False)
        logging.info(' user defined synonyms: %s of type %s',
                     len(doc_type_suggestions), doc_type)

    # one suggestion set per configured relation of every document type
    context_suggestions = {
        doc_type: {
            relation: get_suggestions(search, doc_type, relation,
                                      suggestion_types[relation]['mapping'],
                                      community_uri_to_name)
            for relation in config['relations']}
        for doc_type, config in suggestion_types.items()}

    logging.info('adding context suggestions:')
    for doc_type, relations in context_suggestions.items():
        for relation, suggestions in relations.items():
            suggest.index_documents(suggestions, commit=False)
            logging.info(' titles: %s of type %s in context of %s',
                         len(suggestions), relation, doc_type)

    logging.info('adding theme suggestions:')
    theme_suggestions = get_theme_suggestions(search, 'dataset')
    suggest.index_documents(theme_suggestions, commit=False)
    logging.info(' themes: %s in context of %s',
                 len(theme_suggestions), 'dataset')

    logging.info('committing changes to index')
    suggest.index_documents([], commit=True)

    logging.info('building Solr suggester')
    suggest.build_suggestions('build_suggest')

    logging.info('generate_suggestions.py -- finished')

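# Assumed entry point (not shown in this excerpt): each script presumably ends
# with the conventional guard so that main() runs when the module is executed
# directly.
if __name__ == '__main__':
    main()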