def on_dataset_delete(sender, **kwargs):
    """
    When a Dataset is deleted, purge its data and metadata from Solr.
    """
    dataset = kwargs["instance"]

    PurgeDataTask.apply_async(args=[dataset.slug])
    solr.delete(settings.SOLR_DATASETS_CORE, "slug:%s" % dataset.slug)

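# The receiver above only takes effect once it is connected to Django's
# post_delete signal. A minimal wiring sketch, assuming it sits in the same
# module as the receiver (the project's actual hookup point may differ):
from django.db.models.signals import post_delete

from panda.models import Dataset

post_delete.connect(on_dataset_delete, sender=Dataset)
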
def run(self, dataset_slug):
    log = logging.getLogger('panda.tasks.purge.data')
    log.info('Beginning purge, dataset_slug: %s' % dataset_slug)

    solr.delete(settings.SOLR_DATA_CORE, 'dataset_slug:%s' % dataset_slug)

    log.info('Finished purge, dataset_slug: %s' % dataset_slug)

def after_return(self, status, retval, task_id, args, kwargs, einfo):
    """
    Save final status, results, etc.
    """
    from panda.models import Dataset

    log = logging.getLogger(self.name)

    try:
        dataset = Dataset.objects.get(slug=args[0])
    except Dataset.DoesNotExist:
        log.warning('Can not send reindexing notifications due to Dataset being deleted, dataset_slug: %s' % args[0])
        return

    try:
        try:
            self.send_notifications(dataset, retval, einfo)
        finally:
            # If reindex failed, clear any data that might be staged
            if dataset.current_task.status == 'FAILURE':
                solr.delete(settings.SOLR_DATA_CORE, 'dataset_slug:%s' % args[0], commit=True)
    finally:
        dataset.unlock()

def test_delete(self):
    upload = utils.get_test_data_upload(self.user, self.dataset)
    upload_id = upload.id
    path = upload.get_path()

    self.assertEqual(os.path.isfile(path), True)

    solr.delete(settings.SOLR_DATA_CORE, '*:*')

    self.dataset.import_data(self.user, upload)

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 1)

    upload = DataUpload.objects.get(id=upload_id)
    dataset = Dataset.objects.get(id=self.dataset.id)

    self.assertEqual(dataset.initial_upload, upload)
    self.assertEqual(dataset.row_count, 4)

    upload.delete()

    # Ensure dataset still exists
    dataset = Dataset.objects.get(id=self.dataset.id)

    self.assertEqual(dataset.initial_upload, None)
    self.assertEqual(dataset.row_count, 0)

    self.assertEqual(os.path.exists(path), False)

    with self.assertRaises(DataUpload.DoesNotExist):
        DataUpload.objects.get(id=upload_id)

    self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 0)

def test_delete(self):
    upload = utils.get_test_data_upload(self.user, self.dataset)
    upload_id = upload.id
    path = upload.get_path()

    self.assertEqual(os.path.isfile(path), True)

    solr.delete(settings.SOLR_DATA_CORE, '*:*')

    self.dataset.import_data(self.user, upload)

    self.assertEqual(
        solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 1)

    upload = DataUpload.objects.get(id=upload_id)
    dataset = Dataset.objects.get(id=self.dataset.id)

    self.assertEqual(dataset.initial_upload, upload)
    self.assertEqual(dataset.row_count, 4)

    upload.delete()

    # Ensure dataset still exists
    dataset = Dataset.objects.get(id=self.dataset.id)

    self.assertEqual(dataset.initial_upload, None)
    self.assertEqual(dataset.row_count, 0)

    self.assertEqual(os.path.exists(path), False)

    with self.assertRaises(DataUpload.DoesNotExist):
        DataUpload.objects.get(id=upload_id)

    self.assertEqual(
        solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 0)

def delete_all_rows(self, user):
    """
    Delete all rows in this dataset.
    """
    solr.delete(settings.SOLR_DATA_CORE, 'dataset_slug:%s' % self.slug, commit=True)

    old_row_count = self.row_count
    self.row_count = 0
    self.last_modified = datetime.utcnow()
    self.last_modification = 'All %i rows deleted' % old_row_count
    self.save()

def delete_row(self, user, external_id):
    """
    Delete a row in this dataset.
    """
    solr.delete(settings.SOLR_DATA_CORE, 'dataset_slug:%s AND external_id:%s' % (self.slug, external_id), commit=True)

    self.row_count = self._count_rows()
    self.last_modified = datetime.utcnow()
    self.last_modified_by = user
    self.last_modification = '1 row deleted'
    self.save()

def delete(self, *args, **kwargs):
    """
    Cancel any in progress task.
    """
    # Cancel import if necessary
    if self.current_task:
        self.current_task.request_abort()

    # Cleanup data in Solr
    PurgeDataTask.apply_async(args=[self.slug])
    solr.delete(settings.SOLR_DATASETS_CORE, 'slug:%s' % self.slug)

    super(Dataset, self).delete(*args, **kwargs)

def delete_all_rows(self, user):
    """
    Delete all rows in this dataset.
    """
    self.lock()

    try:
        solr.delete(settings.SOLR_DATA_CORE, "dataset_slug:%s" % self.slug, commit=True)

        old_row_count = self.row_count
        self.row_count = 0
        self.last_modified = datetime.utcnow()
        self.last_modification = "All %i rows deleted" % old_row_count
        self.save()
    finally:
        self.unlock()

def delete_row(self, user, external_id):
    """
    Delete a row in this dataset.
    """
    self.lock()

    try:
        solr.delete(settings.SOLR_DATA_CORE, 'dataset_slug:%s AND external_id:%s' % (self.slug, external_id), commit=True)

        self.row_count = self._count_rows()
        self.last_modified = now()
        self.last_modified_by = user
        self.last_modification = _('1 row deleted')
        self.save()
    finally:
        self.unlock()

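# Illustrative call for the method above; the slug and external_id values are
# made up, and `user` stands for any authenticated PANDA user.
dataset = Dataset.objects.get(slug='crime-reports-2012')
dataset.delete_row(user, '42')

# row_count is recounted from Solr, so it reflects the deletion immediately.
print(dataset.row_count)
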
def delete_all_rows(self, user):
    """
    Delete all rows in this dataset.
    """
    self.lock()

    try:
        solr.delete(settings.SOLR_DATA_CORE, 'dataset_slug:%s' % self.slug, commit=True)

        old_row_count = self.row_count
        self.row_count = 0
        self.last_modified = now()
        self.last_modification = _('All %i rows deleted') % (old_row_count or 0)
        self.save()
    finally:
        self.unlock()

def after_return(self, status, retval, task_id, args, kwargs, einfo):
    """
    Save final status, results, etc.
    """
    from panda.models import Dataset, Notification

    dataset = Dataset.objects.get(slug=args[0])

    task_status = dataset.current_task

    if einfo:
        self.task_exception(
            task_status,
            'Import failed',
            u'\n'.join([einfo.traceback, unicode(retval)])
        )

        email_subject = 'Import failed: %s' % dataset.name
        email_message = 'Import failed: %s:\n\nhttp://%s/#dataset/%s' % (dataset.name, config_value('DOMAIN', 'SITE_DOMAIN'), dataset.slug)
        notification_message = 'Import failed: <strong>%s</strong>' % dataset.name
        notification_type = 'Error'
    elif self.is_aborted():
        email_subject = 'Import aborted: %s' % dataset.name
        email_message = 'Import aborted: %s:\n\nhttp://%s/#dataset/%s' % (dataset.name, config_value('DOMAIN', 'SITE_DOMAIN'), dataset.slug)
        notification_message = 'Import aborted: <strong>%s</strong>' % dataset.name
        notification_type = 'Info'
    else:
        self.task_complete(task_status, 'Import complete')

        email_subject = 'Import complete: %s' % dataset.name
        email_message = 'Import complete: %s:\n\nhttp://%s/#dataset/%s' % (dataset.name, config_value('DOMAIN', 'SITE_DOMAIN'), dataset.slug)
        notification_message = 'Import complete: <strong>%s</strong>' % dataset.name
        notification_type = 'Info'

    if task_status.creator:
        Notification.objects.create(
            recipient=task_status.creator,
            related_task=task_status,
            related_dataset=dataset,
            message=notification_message,
            type=notification_type
        )

        send_mail(email_subject, email_message, [task_status.creator.username])

    # If import failed, clear any data that might be staged
    if task_status.status == 'FAILURE':
        solr.delete(settings.SOLR_DATA_CORE, 'dataset_slug:%s' % args[0], commit=True)

def test_change_user_reindex(self):
    solr.delete(settings.SOLR_DATASETS_CORE, '*:*')

    self.user.first_name = 'bazbarfoo'
    self.user.save()

    dataset = utils.get_test_dataset(self.user)
    upload = utils.get_test_data_upload(self.user, dataset)

    self.assertEqual(solr.query(settings.SOLR_DATASETS_CORE, dataset.creator.first_name)['response']['numFound'], 1)

    old_name = dataset.creator.first_name
    dataset.creator.first_name = 'foobarbaz'
    dataset.creator.save()

    self.assertEqual(solr.query(settings.SOLR_DATASETS_CORE, old_name)['response']['numFound'], 0)
    self.assertEqual(solr.query(settings.SOLR_DATASETS_CORE, dataset.creator.first_name)['response']['numFound'], 1)

def after_return(self, status, retval, task_id, args, kwargs, einfo):
    """
    Save final status, results, etc.
    """
    from panda.models import Dataset

    dataset = Dataset.objects.get(slug=args[0])

    try:
        try:
            self.send_notifications(dataset, retval, einfo)
        finally:
            # If import failed, clear any data that might be staged
            if dataset.current_task.status == "FAILURE":
                solr.delete(settings.SOLR_DATA_CORE, "dataset_slug:%s" % args[0], commit=True)
    finally:
        dataset.unlock()

def delete(self, *args, **kwargs):
    """
    Cancel any in progress task.
    """
    # Cancel import if necessary
    if self.current_task:
        self.current_task.request_abort()

    # Manually delete related uploads so their delete method is called
    for upload in chain(self.data_uploads.all(), self.related_uploads.all()):
        upload.delete(skip_purge=True)

    # Cleanup data in Solr
    PurgeDataTask.apply_async(args=[self.slug])
    solr.delete(settings.SOLR_DATASETS_CORE, 'slug:%s' % self.slug)

    super(Dataset, self).delete(*args, **kwargs)

def delete(self, *args, **kwargs):
    """
    Cancel any in progress task.
    """
    # Cancel import if necessary
    if self.current_task:
        self.current_task.request_abort()

    # Manually delete related uploads so their delete method is called
    for upload in self.data_uploads.all():
        upload.delete(skip_purge=True, force=True)

    for upload in self.related_uploads.all():
        upload.delete()

    # Cleanup data in Solr
    PurgeDataTask.apply_async(args=[self.slug])
    solr.delete(settings.SOLR_DATASETS_CORE, 'slug:%s' % self.slug)

    super(Dataset, self).delete(*args, **kwargs)

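# Illustrative call for the delete() override above (the slug is made up):
# it aborts any running task, deletes the dataset's uploads, queues an
# asynchronous purge of its rows from the data core, and removes its metadata
# document from the datasets core before deleting the database row itself.
dataset = Dataset.objects.get(slug='crime-reports-2012')
dataset.delete()
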
def delete_all_rows(self, user):
    """
    Delete all rows in this dataset.
    """
    self.lock()

    try:
        solr.delete(settings.SOLR_DATA_CORE, 'dataset_slug:%s' % self.slug, commit=True)

        old_row_count = self.row_count
        self.row_count = 0
        self.last_modified = now()
        self.last_modification = 'All %i rows deleted' % (old_row_count or 0)
        self.save()
    finally:
        self.unlock()

def run(self, dataset_slug, data_upload_id=None):
    from panda.models import Dataset

    log = logging.getLogger(self.name)
    log.info('Beginning purge, dataset_slug: %s' % dataset_slug)

    if data_upload_id:
        q = 'data_upload_id:%i' % data_upload_id
    else:
        q = 'dataset_slug:%s' % dataset_slug

    solr.delete(settings.SOLR_DATA_CORE, q)

    try:
        # If the dataset hasn't been deleted, update its row count
        dataset = Dataset.objects.get(slug=dataset_slug)
        dataset.row_count = dataset._count_rows()
        dataset.save()
    except Dataset.DoesNotExist:
        pass

    log.info('Finished purge, dataset_slug: %s' % dataset_slug)

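# Usage sketch for the task above; the import path, slug and upload id are
# assumptions for illustration.
from panda.tasks import PurgeDataTask

# Purge every row indexed for a dataset.
PurgeDataTask.apply_async(args=['crime-reports-2012'])

# Purge only the rows that came from a single DataUpload.
PurgeDataTask.apply_async(args=['crime-reports-2012'], kwargs={'data_upload_id': 42})
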
def after_return(self, status, retval, task_id, args, kwargs, einfo):
    """
    Save final status, results, etc.
    """
    from panda.models import Dataset

    log = logging.getLogger(self.name)

    try:
        dataset = Dataset.objects.get(slug=args[0])
    except Dataset.DoesNotExist:
        log.warning('Can not send reindexing notifications due to Dataset being deleted, dataset_slug: %s' % args[0])
        return

    try:
        try:
            self.send_notifications(dataset, retval, einfo)
        finally:
            # If reindex failed, clear any data that might be staged
            if dataset.current_task.status == 'FAILURE':
                solr.delete(settings.SOLR_DATA_CORE, 'dataset_slug:%s' % args[0], commit=True)
    finally:
        dataset.unlock()

def search_all_data(self, request, **kwargs):
    """
    List endpoint using Solr. Provides full-text search via the "q" parameter.
    """
    self.method_check(request, allowed=['get'])
    self.is_authenticated(request)
    self.throttle_check(request)

    try:
        query = '(%s)' % request.GET['q']
    except KeyError:
        query = ''

    category = request.GET.get('category', '')
    since = request.GET.get('since', None)
    limit = int(request.GET.get('limit', settings.PANDA_DEFAULT_SEARCH_GROUPS))
    offset = int(request.GET.get('offset', 0))
    group_limit = int(request.GET.get('group_limit', settings.PANDA_DEFAULT_SEARCH_ROWS_PER_GROUP))
    group_offset = int(request.GET.get('group_offset', 0))
    export = bool(request.GET.get('export', False))

    solr_query_bits = [query]

    if category:
        if category != 'uncategorized':
            category = Category.objects.get(slug=category)
            dataset_slugs = category.datasets.values_list('slug', flat=True)
        else:
            dataset_slugs = Dataset.objects.filter(categories=None).values_list('slug', flat=True)

        solr_query_bits.append('dataset_slug:(%s)' % ' '.join(dataset_slugs))

    if since:
        solr_query_bits.append('last_modified:[' + since + 'Z TO *]')

    # Because users may have authenticated via headers the request.user may
    # not be a full User instance. To be sure, we fetch one.
    user = UserProxy.objects.get(id=request.user.id)

    if export:
        task_type = ExportSearchTask

        task = TaskStatus.objects.create(
            task_name=task_type.name,
            task_description=_('Export search results for "%s".') % query,
            creator=user
        )

        task_type.apply_async(
            args=[query, task.id],
            kwargs={},
            task_id=task.id
        )
    else:
        response = solr.query_grouped(
            settings.SOLR_DATA_CORE,
            ' AND '.join(solr_query_bits),
            'dataset_slug',
            offset=offset,
            limit=limit,
            group_limit=group_limit,
            group_offset=group_offset
        )
        groups = response['grouped']['dataset_slug']['groups']

        page = PandaPaginator(
            request.GET,
            groups,
            resource_uri=request.path_info,
            count=response['grouped']['dataset_slug']['ngroups']
        ).page()

        datasets = []

        for group in groups:
            dataset_slug = group['groupValue']
            results = group['doclist']

            try:
                dataset = Dataset.objects.get(slug=dataset_slug)
            # In the event that stale data exists in Solr, skip this dataset,
            # request the invalid data be purged and return the other results.
            # Pagination may be wrong, but this is the most functional solution. (#793)
            except Dataset.DoesNotExist:
                PurgeDataTask.apply_async(args=[dataset_slug])
                solr.delete(settings.SOLR_DATASETS_CORE, 'slug:%s' % dataset_slug)

                page['meta']['total_count'] -= 1

                continue

            dataset_resource = DatasetResource()
            dataset_bundle = dataset_resource.build_bundle(obj=dataset, request=request)
            dataset_bundle = dataset_resource.full_dehydrate(dataset_bundle)
            dataset_bundle = dataset_resource.simplify_bundle(dataset_bundle)

            objects = [SolrObject(obj) for obj in results['docs']]

            dataset_search_url = reverse('api_dataset_data_list', kwargs={
                'api_name': self._meta.api_name,
                'dataset_resource_name': 'dataset',
                'resource_name': 'data',
                'dataset_slug': dataset.slug
            })

            data_page = PandaPaginator(
                {
                    'limit': str(group_limit),
                    'offset': str(group_offset),
                    'q': query
                },
                objects,
                resource_uri=dataset_search_url,
                count=results['numFound']
            ).page()

            dataset_bundle.data.update(data_page)
            dataset_bundle.data['objects'] = []

            for obj in objects:
                data_bundle = self.build_bundle(obj=obj, request=request)
                data_bundle = self.full_dehydrate(data_bundle)
                dataset_bundle.data['objects'].append(data_bundle)

            datasets.append(dataset_bundle.data)

        page['objects'] = datasets

    # Log query
    SearchLog.objects.create(user=user, dataset=None, query=query)

    self.log_throttled_access(request)

    if export:
        return self.create_response(request, _('Export queued.'))
    else:
        return self.create_response(request, page)

def setup_test_solr():
    settings.SOLR_DATA_CORE = 'data_test'
    settings.SOLR_DATASETS_CORE = 'datasets_test'
    config_get('PERF', 'TASK_THROTTLE').update(0.0)
    solr.delete(settings.SOLR_DATA_CORE, '*:*')
    solr.delete(settings.SOLR_DATASETS_CORE, '*:*')

def setup_test_solr():
    settings.SOLR_DATA_CORE = 'data_test'
    settings.SOLR_DATASETS_CORE = 'datasets_test'
    solr.delete(settings.SOLR_DATA_CORE, '*:*')
    solr.delete(settings.SOLR_DATASETS_CORE, '*:*')

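# Illustrative use of the helper above in a test case; the import path and
# class name are assumptions.
from django.test import TransactionTestCase

from panda.tests import utils


class TestDatasetSearch(TransactionTestCase):
    def setUp(self):
        # Point the app at the *_test cores and wipe any leftover documents
        # so each test starts from an empty index.
        utils.setup_test_solr()
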
def search_all_data(self, request, **kwargs):
    """
    List endpoint using Solr. Provides full-text search via the "q" parameter.
    """
    self.method_check(request, allowed=['get'])
    self.is_authenticated(request)
    self.throttle_check(request)

    query = request.GET.get('q', '')
    category = request.GET.get('category', '')
    since = request.GET.get('since', None)
    limit = int(request.GET.get('limit', settings.PANDA_DEFAULT_SEARCH_GROUPS))
    offset = int(request.GET.get('offset', 0))
    group_limit = int(request.GET.get('group_limit', settings.PANDA_DEFAULT_SEARCH_ROWS_PER_GROUP))
    group_offset = int(request.GET.get('group_offset', 0))
    export = bool(request.GET.get('export', False))

    if category:
        if category != 'uncategorized':
            category = Category.objects.get(slug=category)
            dataset_slugs = category.datasets.values_list('slug', flat=True)
        else:
            dataset_slugs = Dataset.objects.filter(categories=None).values_list('slug', flat=True)

        query += ' dataset_slug:(%s)' % ' '.join(dataset_slugs)

    if since:
        query = 'last_modified:[' + since + 'Z TO *] AND (%s)' % query

    # Because users may have authenticated via headers the request.user may
    # not be a full User instance. To be sure, we fetch one.
    user = UserProxy.objects.get(id=request.user.id)

    if export:
        task_type = ExportSearchTask

        task = TaskStatus.objects.create(
            task_name=task_type.name,
            task_description='Export search results for "%s".' % query,
            creator=user
        )

        task_type.apply_async(
            args=[query, task.id],
            kwargs={},
            task_id=task.id
        )
    else:
        response = solr.query_grouped(
            settings.SOLR_DATA_CORE,
            query,
            'dataset_slug',
            offset=offset,
            limit=limit,
            group_limit=group_limit,
            group_offset=group_offset
        )
        groups = response['grouped']['dataset_slug']['groups']

        page = PandaPaginator(
            request.GET,
            groups,
            resource_uri=request.path_info,
            count=response['grouped']['dataset_slug']['ngroups']
        ).page()

        datasets = []

        for group in groups:
            dataset_slug = group['groupValue']
            results = group['doclist']

            try:
                dataset = Dataset.objects.get(slug=dataset_slug)
            # In the event that stale data exists in Solr, skip this dataset,
            # request the invalid data be purged and return the other results.
            # Pagination may be wrong, but this is the most functional solution. (#793)
            except Dataset.DoesNotExist:
                PurgeDataTask.apply_async(args=[dataset_slug])
                solr.delete(settings.SOLR_DATASETS_CORE, 'slug:%s' % dataset_slug)

                page['meta']['total_count'] -= 1

                continue

            dataset_resource = DatasetResource()
            dataset_bundle = dataset_resource.build_bundle(obj=dataset, request=request)
            dataset_bundle = dataset_resource.full_dehydrate(dataset_bundle)
            dataset_bundle = dataset_resource.simplify_bundle(dataset_bundle)

            objects = [SolrObject(obj) for obj in results['docs']]

            dataset_search_url = reverse('api_dataset_data_list', kwargs={
                'api_name': self._meta.api_name,
                'dataset_resource_name': 'dataset',
                'resource_name': 'data',
                'dataset_slug': dataset.slug
            })

            data_page = PandaPaginator(
                {
                    'limit': str(group_limit),
                    'offset': str(group_offset),
                    'q': query
                },
                objects,
                resource_uri=dataset_search_url,
                count=results['numFound']
            ).page()

            dataset_bundle.data.update(data_page)
            dataset_bundle.data['objects'] = []

            for obj in objects:
                data_bundle = self.build_bundle(obj=obj, request=request)
                data_bundle = self.full_dehydrate(data_bundle)
                dataset_bundle.data['objects'].append(data_bundle)

            datasets.append(dataset_bundle.data)

        page['objects'] = datasets

    # Log query
    SearchLog.objects.create(user=user, dataset=None, query=query)

    self.log_throttled_access(request)

    if export:
        return self.create_response(request, 'Export queued.')
    else:
        return self.create_response(request, page)

def search_all_data(self, request, **kwargs):
    """
    List endpoint using Solr. Provides full-text search via the "q" parameter.
    """
    self.method_check(request, allowed=["get"])
    self.is_authenticated(request)
    self.throttle_check(request)

    query = request.GET.get("q", "")
    category = request.GET.get("category", "")
    since = request.GET.get("since", None)
    limit = int(request.GET.get("limit", settings.PANDA_DEFAULT_SEARCH_GROUPS))
    offset = int(request.GET.get("offset", 0))
    group_limit = int(request.GET.get("group_limit", settings.PANDA_DEFAULT_SEARCH_ROWS_PER_GROUP))
    group_offset = int(request.GET.get("group_offset", 0))
    export = bool(request.GET.get("export", False))

    if category:
        if category != "uncategorized":
            category = Category.objects.get(slug=category)
            dataset_slugs = category.datasets.values_list("slug", flat=True)
        else:
            dataset_slugs = Dataset.objects.filter(categories=None).values_list("slug", flat=True)

        query += " dataset_slug:(%s)" % " ".join(dataset_slugs)

    if since:
        query = "last_modified:[" + since + "Z TO *] AND (%s)" % query

    # Because users may have authenticated via headers the request.user may
    # not be a full User instance. To be sure, we fetch one.
    user = UserProxy.objects.get(id=request.user.id)

    if export:
        task_type = ExportSearchTask

        task = TaskStatus.objects.create(
            task_name=task_type.name,
            task_description='Export search results for "%s".' % query,
            creator=user,
        )

        task_type.apply_async(args=[query, task.id], kwargs={}, task_id=task.id)
    else:
        response = solr.query_grouped(
            settings.SOLR_DATA_CORE,
            query,
            "dataset_slug",
            offset=offset,
            limit=limit,
            group_limit=group_limit,
            group_offset=group_offset,
        )
        groups = response["grouped"]["dataset_slug"]["groups"]

        page = PandaPaginator(
            request.GET,
            groups,
            resource_uri=request.path_info,
            count=response["grouped"]["dataset_slug"]["ngroups"],
        ).page()

        datasets = []

        for group in groups:
            dataset_slug = group["groupValue"]
            results = group["doclist"]

            try:
                dataset = Dataset.objects.get(slug=dataset_slug)
            # In the event that stale data exists in Solr, skip this dataset,
            # request the invalid data be purged and return the other results.
            # Pagination may be wrong, but this is the most functional solution. (#793)
            except Dataset.DoesNotExist:
                PurgeDataTask.apply_async(args=[dataset_slug])
                solr.delete(settings.SOLR_DATASETS_CORE, "slug:%s" % dataset_slug)

                page["meta"]["total_count"] -= 1

                continue

            dataset_resource = DatasetResource()
            dataset_bundle = dataset_resource.build_bundle(obj=dataset, request=request)
            dataset_bundle = dataset_resource.full_dehydrate(dataset_bundle)
            dataset_bundle = dataset_resource.simplify_bundle(dataset_bundle)

            objects = [SolrObject(obj) for obj in results["docs"]]

            dataset_search_url = reverse(
                "api_dataset_data_list",
                kwargs={
                    "api_name": self._meta.api_name,
                    "dataset_resource_name": "dataset",
                    "resource_name": "data",
                    "dataset_slug": dataset.slug,
                },
            )

            data_page = PandaPaginator(
                {"limit": str(group_limit), "offset": str(group_offset), "q": query},
                objects,
                resource_uri=dataset_search_url,
                count=results["numFound"],
            ).page()

            dataset_bundle.data.update(data_page)
            dataset_bundle.data["objects"] = []

            for obj in objects:
                data_bundle = self.build_bundle(obj=obj, request=request)
                data_bundle = self.full_dehydrate(data_bundle)
                dataset_bundle.data["objects"].append(data_bundle)

            datasets.append(dataset_bundle.data)

        page["objects"] = datasets

    # Log query
    SearchLog.objects.create(user=user, dataset=None, query=query)

    self.log_throttled_access(request)

    if export:
        return self.create_response(request, "Export queued.")
    else:
        return self.create_response(request, page)

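# For reference, a minimal sketch of the grouped Solr response consumed by the
# search_all_data views above. Only the keys the code reads are shown; all
# values are made up.
response = {
    "grouped": {
        "dataset_slug": {
            "ngroups": 1,
            "groups": [
                {
                    "groupValue": "crime-reports-2012",
                    "doclist": {
                        "numFound": 4,
                        "docs": [{"external_id": "1", "full_text": "Christopher ..."}],
                    },
                }
            ],
        }
    }
}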