def run(self, query, task_status_id, filename=None, *args, **kwargs):
    """
    Execute a search export: write one CSV per dataset matching the
    query and bundle them into a zip under settings.EXPORT_ROOT.

    :param query: Solr query string selecting the rows to export.
    :param task_status_id: pk of the TaskStatus row used to report progress.
    :param filename: optional base name (no extension) for the export;
        defaults to a timestamped name.
    :returns: the name of the created zip file, or None if aborted.
    """
    from panda.models import Dataset, TaskStatus

    log = logging.getLogger(self.name)
    log.info('Beginning export, query: %s' % query)

    task_status = TaskStatus.objects.get(id=task_status_id)
    # Fixed: message previously said 'Preparing to import' in an export task.
    task_status.begin('Preparing to export')

    if not filename:
        filename = 'search_export_%s' % now().isoformat()

    zip_name = '%s.zip' % filename

    path = os.path.join(settings.EXPORT_ROOT, filename)
    zip_path = os.path.join(settings.EXPORT_ROOT, zip_name)

    try:
        os.makedirs(os.path.realpath(path))
    except OSError:
        # Directory already exists; any other failure will surface when
        # the CSV files are opened below. (Was a bare except.)
        pass

    # Renamed from "zipfile" so it no longer shadows the stdlib module.
    zip_file = ZipFile(zip_path, 'w')

    response = solr.query_grouped(
        settings.SOLR_DATA_CORE,
        query,
        'dataset_slug',
        offset=0,
        limit=1000,
        group_limit=0,
        group_offset=0
    )

    groups = response['grouped']['dataset_slug']['groups']

    # Map of dataset slug -> number of rows matching the query.
    datasets = {}

    for group in groups:
        datasets[group['groupValue']] = group['doclist']['numFound']

    total_n = 0
    total_count = sum(datasets.values())
    throttle = config_value('PERF', 'TASK_THROTTLE')

    for dataset_slug in datasets:
        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning('Skipping part of export due to Dataset being deleted, dataset_slug: %s' % dataset_slug)

            continue

        filename = '%s.csv' % dataset_slug
        file_path = os.path.join(path, filename)

        # Context manager guarantees the CSV is closed even if a Solr
        # query or row write raises (was a bare open()/close() pair).
        with open(file_path, 'w') as f:
            writer = CSVKitWriter(f)

            # Header
            writer.writerow([c['name'] for c in dataset.column_schema])

            # Fixed: the recount is now scoped to this dataset. The old
            # code queried without the dataset filter and overwrote the
            # per-dataset count with the global match total.
            response = solr.query(
                settings.SOLR_DATA_CORE,
                'dataset_slug: %s %s' % (dataset_slug, query),
                offset=0,
                limit=0
            )

            # Update dataset and total counts for progress tracking
            datasets[dataset_slug] = response['response']['numFound']
            total_count = sum(datasets.values())

            n = 0

            while n < datasets[dataset_slug]:
                response = solr.query(
                    settings.SOLR_DATA_CORE,
                    'dataset_slug: %s %s' % (dataset_slug, query),
                    offset=n,
                    limit=SOLR_PAGE_SIZE
                )

                results = response['response']['docs']

                for row in results:
                    writer.writerow(json.loads(row['data']))

                task_status.update('%.0f%% complete' % floor(float(total_n) / float(total_count) * 100))

                if self.is_aborted():
                    task_status.abort('Aborted after exporting %.0f%%' % floor(float(total_n) / float(total_count) * 100))
                    log.warning('Export aborted, query: %s' % query)

                    return

                n += SOLR_PAGE_SIZE
                # Fixed: advance progress by the rows written this page.
                # The old code added numFound (the full match count) on
                # every page, so the percentage overshot wildly.
                total_n += len(results)

                time.sleep(throttle)

        # Add to zip and nuke temp file
        zip_file.write(file_path, filename)
        os.remove(file_path)

    # Finish zip file and nuke temp directory
    zip_file.close()
    os.rmdir(path)

    task_status.update('100% complete')
    log.info('Finished export, query: %s' % query)

    return zip_name
def search_all_data(self, request, **kwargs):
    """
    List endpoint using Solr. Provides full-text search via the "q"
    parameter. Results are grouped by dataset; an "export" flag queues
    an asynchronous export task instead of returning results inline.
    """
    self.method_check(request, allowed=["get"])
    self.is_authenticated(request)
    self.throttle_check(request)

    query = request.GET.get("q", "")
    category = request.GET.get("category", "")
    since = request.GET.get("since", None)
    limit = int(request.GET.get("limit", settings.PANDA_DEFAULT_SEARCH_GROUPS))
    offset = int(request.GET.get("offset", 0))
    group_limit = int(request.GET.get("group_limit", settings.PANDA_DEFAULT_SEARCH_ROWS_PER_GROUP))
    group_offset = int(request.GET.get("group_offset", 0))
    # NOTE(review): bool() on a query-string value means "?export=false"
    # still queues an export -- any non-empty string is truthy. Confirm
    # callers only ever send export=true.
    export = bool(request.GET.get("export", False))

    # Restrict the query to datasets in the requested category (or to
    # datasets with no category when category == "uncategorized").
    if category:
        if category != "uncategorized":
            category = Category.objects.get(slug=category)
            dataset_slugs = category.datasets.values_list("slug", flat=True)
        else:
            dataset_slugs = Dataset.objects.filter(categories=None).values_list("slug", flat=True)

        query += " dataset_slug:(%s)" % " ".join(dataset_slugs)

    # Only match rows modified since the given timestamp.
    # NOTE(review): when "q" is empty this produces "... AND ()", which
    # does not look like a valid Solr clause -- verify against Solr.
    if since:
        query = "last_modified:[" + since + "Z TO *] AND (%s)" % query

    # Because users may have authenticated via headers the request.user may
    # not be a full User instance. To be sure, we fetch one.
    user = UserProxy.objects.get(id=request.user.id)

    if export:
        # Queue the export task and skip querying entirely.
        task_type = ExportSearchTask

        task = TaskStatus.objects.create(
            task_name=task_type.name,
            task_description='Export search results for "%s".' % query,
            creator=user
        )

        task_type.apply_async(args=[query, task.id], kwargs={}, task_id=task.id)
    else:
        response = solr.query_grouped(
            settings.SOLR_DATA_CORE,
            query,
            "dataset_slug",
            offset=offset,
            limit=limit,
            group_limit=group_limit,
            group_offset=group_offset,
        )
        groups = response["grouped"]["dataset_slug"]["groups"]

        # Outer pagination is over dataset groups, not individual rows.
        page = PandaPaginator(
            request.GET,
            groups,
            resource_uri=request.path_info,
            count=response["grouped"]["dataset_slug"]["ngroups"],
        ).page()

        datasets = []

        for group in groups:
            dataset_slug = group["groupValue"]
            results = group["doclist"]

            try:
                dataset = Dataset.objects.get(slug=dataset_slug)
            # In the event that stale data exists in Solr, skip this dataset,
            # request the invalid data be purged and return the other results.
            # Pagination may be wrong, but this is the most functional solution. (#793)
            except Dataset.DoesNotExist:
                PurgeDataTask.apply_async(args=[dataset_slug])
                solr.delete(settings.SOLR_DATASETS_CORE, "slug:%s" % dataset_slug)

                page["meta"]["total_count"] -= 1

                continue

            dataset_resource = DatasetResource()
            dataset_bundle = dataset_resource.build_bundle(obj=dataset, request=request)
            dataset_bundle = dataset_resource.full_dehydrate(dataset_bundle)
            dataset_bundle = dataset_resource.simplify_bundle(dataset_bundle)

            objects = [SolrObject(obj) for obj in results["docs"]]

            dataset_search_url = reverse(
                "api_dataset_data_list",
                kwargs={
                    "api_name": self._meta.api_name,
                    "dataset_resource_name": "dataset",
                    "resource_name": "data",
                    "dataset_slug": dataset.slug,
                },
            )

            # Nested pagination: this dataset's rows are paginated
            # independently with group_limit/group_offset.
            data_page = PandaPaginator(
                {"limit": str(group_limit), "offset": str(group_offset), "q": query},
                objects,
                resource_uri=dataset_search_url,
                count=results["numFound"],
            ).page()

            dataset_bundle.data.update(data_page)
            dataset_bundle.data["objects"] = []

            for obj in objects:
                data_bundle = self.build_bundle(obj=obj, request=request)
                data_bundle = self.full_dehydrate(data_bundle)
                dataset_bundle.data["objects"].append(data_bundle)

            datasets.append(dataset_bundle.data)

        page["objects"] = datasets

    # Log query
    SearchLog.objects.create(user=user, dataset=None, query=query)

    self.log_throttled_access(request)

    if export:
        return self.create_response(request, "Export queued.")
    else:
        return self.create_response(request, page)
def run(self, query, task_status_id, filename=None, *args, **kwargs):
    """
    Execute a search export: write one CSV per dataset matching the
    query and bundle them into a zip under settings.EXPORT_ROOT.

    :param query: Solr query string selecting the rows to export.
    :param task_status_id: pk of the TaskStatus row used to report progress.
    :param filename: optional base name (no extension) for the export;
        defaults to a timestamped name.
    :returns: the name of the created zip file, or None if aborted.
    """
    from panda.models import Dataset, TaskStatus

    log = logging.getLogger(self.name)
    log.info('Beginning export, query: %s' % query)

    task_status = TaskStatus.objects.get(id=task_status_id)
    # Fixed: message previously said 'Preparing to import' in an export task.
    task_status.begin('Preparing to export')

    if not filename:
        filename = 'search_export_%s' % now().isoformat()

    zip_name = '%s.zip' % filename

    path = os.path.join(settings.EXPORT_ROOT, filename)
    zip_path = os.path.join(settings.EXPORT_ROOT, zip_name)

    try:
        os.makedirs(os.path.realpath(path))
    except OSError:
        # Directory already exists; any other failure will surface when
        # the CSV files are opened below. (Was a bare except.)
        pass

    # Renamed from "zipfile" so it no longer shadows the stdlib module.
    zip_file = ZipFile(zip_path, 'w')

    response = solr.query_grouped(
        settings.SOLR_DATA_CORE,
        query,
        'dataset_slug',
        offset=0,
        limit=1000,
        group_limit=0,
        group_offset=0
    )

    groups = response['grouped']['dataset_slug']['groups']

    # Map of dataset slug -> number of rows matching the query.
    datasets = {}

    for group in groups:
        datasets[group['groupValue']] = group['doclist']['numFound']

    total_n = 0
    total_count = sum(datasets.values())
    throttle = config_value('PERF', 'TASK_THROTTLE')

    for dataset_slug in datasets:
        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning('Skipping part of export due to Dataset being deleted, dataset_slug: %s' % dataset_slug)

            continue

        filename = '%s.csv' % dataset_slug
        file_path = os.path.join(path, filename)

        # Context manager guarantees the CSV is closed even if a Solr
        # query or row write raises (was a bare open()/close() pair).
        with open(file_path, 'w') as f:
            writer = CSVKitWriter(f)

            # Header
            writer.writerow([c['name'] for c in dataset.column_schema])

            # Fixed: the recount is now scoped to this dataset. The old
            # code queried without the dataset filter and overwrote the
            # per-dataset count with the global match total.
            response = solr.query(
                settings.SOLR_DATA_CORE,
                'dataset_slug: %s %s' % (dataset_slug, query),
                offset=0,
                limit=0
            )

            # Update dataset and total counts for progress tracking
            datasets[dataset_slug] = response['response']['numFound']
            total_count = sum(datasets.values())

            n = 0

            while n < datasets[dataset_slug]:
                response = solr.query(
                    settings.SOLR_DATA_CORE,
                    'dataset_slug: %s %s' % (dataset_slug, query),
                    offset=n,
                    limit=SOLR_PAGE_SIZE
                )

                results = response['response']['docs']

                for row in results:
                    writer.writerow(json.loads(row['data']))

                task_status.update('%.0f%% complete' % floor(float(total_n) / float(total_count) * 100))

                if self.is_aborted():
                    task_status.abort('Aborted after exporting %.0f%%' % floor(float(total_n) / float(total_count) * 100))
                    log.warning('Export aborted, query: %s' % query)

                    return

                n += SOLR_PAGE_SIZE
                # Fixed: advance progress by the rows written this page.
                # The old code added numFound (the full match count) on
                # every page, so the percentage overshot wildly.
                total_n += len(results)

                time.sleep(throttle)

        # Add to zip and nuke temp file
        zip_file.write(file_path, filename)
        os.remove(file_path)

    # Finish zip file and nuke temp directory
    zip_file.close()
    os.rmdir(path)

    task_status.update('100% complete')
    log.info('Finished export, query: %s' % query)

    return zip_name
def search_all_data(self, request, **kwargs):
    """
    List endpoint using Solr. Provides full-text search via the "q"
    parameter. Results are grouped by dataset slug and paginated at
    two levels: datasets (limit/offset) and rows within each dataset
    (group_limit/group_offset).
    """
    self.method_check(request, allowed=['get'])
    self.is_authenticated(request)
    self.throttle_check(request)

    params = request.GET
    query = params.get('q', '')
    limit = int(params.get('limit', settings.PANDA_DEFAULT_SEARCH_GROUPS))
    offset = int(params.get('offset', 0))
    group_limit = int(params.get('group_limit', settings.PANDA_DEFAULT_SEARCH_ROWS_PER_GROUP))
    group_offset = int(params.get('group_offset', 0))

    solr_response = solr.query_grouped(
        settings.SOLR_DATA_CORE,
        query,
        'dataset_slug',
        offset=offset,
        limit=limit,
        group_limit=group_limit,
        group_offset=group_offset
    )
    grouped = solr_response['grouped']['dataset_slug']
    groups = grouped['groups']

    # Outer pagination: one "object" per dataset group.
    page = PandaPaginator(
        request.GET,
        groups,
        resource_uri=request.path_info,
        count=grouped['ngroups']
    ).page()

    dataset_pages = []

    for group in groups:
        slug = group['groupValue']
        doclist = group['doclist']

        resource = DatasetResource()
        dataset = Dataset.objects.get(slug=slug)

        # Dehydrate the dataset itself into a simplified bundle.
        bundle = resource.build_bundle(obj=dataset, request=request)
        bundle = resource.full_dehydrate(bundle)
        bundle = resource.simplify_bundle(bundle)

        rows = [SolrObject(doc) for doc in doclist['docs']]

        search_uri = reverse('api_dataset_data_list', kwargs={
            'api_name': self._meta.api_name,
            'dataset_resource_name': 'dataset',
            'resource_name': 'data',
            'dataset_slug': dataset.slug
        })

        # Inner pagination metadata for this dataset's rows.
        row_page = PandaPaginator(
            {
                'limit': str(group_limit),
                'offset': str(group_offset),
                'q': query
            },
            rows,
            resource_uri=search_uri,
            count=doclist['numFound']
        ).page()

        bundle.data.update(row_page)
        bundle.data['objects'] = [
            self.full_dehydrate(self.build_bundle(obj=row, request=request))
            for row in rows
        ]

        dataset_pages.append(bundle.data)

    page['objects'] = dataset_pages

    self.log_throttled_access(request)

    return self.create_response(request, page)
def search_all_data(self, request, **kwargs):
    """
    List endpoint using Solr. Provides full-text search via the "q"
    parameter. Supports category and "since" filters; an "export" flag
    queues an asynchronous export task instead of returning results.
    """
    self.method_check(request, allowed=['get'])
    self.is_authenticated(request)
    self.throttle_check(request)

    query = request.GET.get('q', '')
    category = request.GET.get('category', '')
    since = request.GET.get('since', None)
    limit = int(request.GET.get('limit', settings.PANDA_DEFAULT_SEARCH_GROUPS))
    offset = int(request.GET.get('offset', 0))
    group_limit = int(request.GET.get('group_limit', settings.PANDA_DEFAULT_SEARCH_ROWS_PER_GROUP))
    group_offset = int(request.GET.get('group_offset', 0))
    # NOTE(review): bool() on a query-string value means "?export=false"
    # still queues an export -- any non-empty string is truthy.
    export = bool(request.GET.get('export', False))

    # Restrict the query to datasets in the requested category (or to
    # datasets with no category when category == 'uncategorized').
    if category:
        if category != 'uncategorized':
            category = Category.objects.get(slug=category)
            dataset_slugs = category.datasets.values_list('slug', flat=True)
        else:
            dataset_slugs = Dataset.objects.filter(categories=None).values_list('slug', flat=True)

        query += ' dataset_slug:(%s)' % ' '.join(dataset_slugs)

    # Only match rows modified since the given timestamp.
    # NOTE(review): when "q" is empty this produces "... AND ()", which
    # does not look like a valid Solr clause -- verify.
    if since:
        query = 'last_modified:[' + since + 'Z TO *] AND (%s)' % query

    # Because users may have authenticated via headers the request.user may
    # not be a full User instance. To be sure, we fetch one.
    user = UserProxy.objects.get(id=request.user.id)

    if export:
        # Queue the export task and skip querying entirely.
        task_type = ExportSearchTask

        task = TaskStatus.objects.create(
            task_name=task_type.name,
            task_description='Export search results for "%s".' % query,
            creator=user)

        task_type.apply_async(args=[query, task.id], kwargs={}, task_id=task.id)
    else:
        response = solr.query_grouped(
            settings.SOLR_DATA_CORE,
            query,
            'dataset_slug',
            offset=offset,
            limit=limit,
            group_limit=group_limit,
            group_offset=group_offset)

        groups = response['grouped']['dataset_slug']['groups']

        # Outer pagination is over dataset groups, not individual rows.
        page = PandaPaginator(
            request.GET,
            groups,
            resource_uri=request.path_info,
            count=response['grouped']['dataset_slug']['ngroups']).page()

        datasets = []

        for group in groups:
            dataset_slug = group['groupValue']
            results = group['doclist']

            try:
                dataset = Dataset.objects.get(slug=dataset_slug)
            # In the event that stale data exists in Solr, skip this dataset,
            # request the invalid data be purged and return the other results.
            # Pagination may be wrong, but this is the most functional solution. (#793)
            except Dataset.DoesNotExist:
                PurgeDataTask.apply_async(args=[dataset_slug])
                solr.delete(settings.SOLR_DATASETS_CORE, 'slug:%s' % dataset_slug)

                page['meta']['total_count'] -= 1

                continue

            dataset_resource = DatasetResource()
            dataset_bundle = dataset_resource.build_bundle(obj=dataset, request=request)
            dataset_bundle = dataset_resource.full_dehydrate(dataset_bundle)
            dataset_bundle = dataset_resource.simplify_bundle(dataset_bundle)

            objects = [SolrObject(obj) for obj in results['docs']]

            dataset_search_url = reverse('api_dataset_data_list', kwargs={
                'api_name': self._meta.api_name,
                'dataset_resource_name': 'dataset',
                'resource_name': 'data',
                'dataset_slug': dataset.slug
            })

            # Nested pagination: this dataset's rows are paginated
            # independently with group_limit/group_offset.
            data_page = PandaPaginator(
                {
                    'limit': str(group_limit),
                    'offset': str(group_offset),
                    'q': query
                },
                objects,
                resource_uri=dataset_search_url,
                count=results['numFound']).page()

            dataset_bundle.data.update(data_page)
            dataset_bundle.data['objects'] = []

            for obj in objects:
                data_bundle = self.build_bundle(obj=obj, request=request)
                data_bundle = self.full_dehydrate(data_bundle)
                dataset_bundle.data['objects'].append(data_bundle)

            datasets.append(dataset_bundle.data)

        page['objects'] = datasets

    # Log query
    SearchLog.objects.create(user=user, dataset=None, query=query)

    self.log_throttled_access(request)

    if export:
        return self.create_response(request, 'Export queued.')
    else:
        return self.create_response(request, page)
def search_all_data(self, request, **kwargs):
    """
    List endpoint using Solr. Provides full-text search via the "q"
    parameter. Returns one entry per matching dataset, each carrying
    its own paginated slice of matching rows.
    """
    self.method_check(request, allowed=["get"])
    self.is_authenticated(request)
    self.throttle_check(request)

    get_param = request.GET.get
    query = get_param("q", "")
    limit = int(get_param("limit", settings.PANDA_DEFAULT_SEARCH_GROUPS))
    offset = int(get_param("offset", 0))
    group_limit = int(get_param("group_limit", settings.PANDA_DEFAULT_SEARCH_ROWS_PER_GROUP))
    group_offset = int(get_param("group_offset", 0))

    grouped_response = solr.query_grouped(
        settings.SOLR_DATA_CORE,
        query,
        "dataset_slug",
        offset=offset,
        limit=limit,
        group_limit=group_limit,
        group_offset=group_offset,
    )
    slug_groups = grouped_response["grouped"]["dataset_slug"]

    # Outer pagination: one "object" per dataset group.
    page = PandaPaginator(
        request.GET,
        slug_groups["groups"],
        resource_uri=request.path_info,
        count=slug_groups["ngroups"],
    ).page()

    results_by_dataset = []

    for group in slug_groups["groups"]:
        doclist = group["doclist"]
        dataset = Dataset.objects.get(slug=group["groupValue"])

        # Dehydrate the dataset itself into a simplified bundle.
        resource = DatasetResource()
        dataset_bundle = resource.build_bundle(obj=dataset, request=request)
        dataset_bundle = resource.full_dehydrate(dataset_bundle)
        dataset_bundle = resource.simplify_bundle(dataset_bundle)

        solr_rows = [SolrObject(doc) for doc in doclist["docs"]]

        rows_uri = reverse(
            "api_dataset_data_list",
            kwargs={
                "api_name": self._meta.api_name,
                "dataset_resource_name": "dataset",
                "resource_name": "data",
                "dataset_slug": dataset.slug,
            },
        )

        # Inner pagination metadata for this dataset's rows.
        rows_page = PandaPaginator(
            {"limit": str(group_limit), "offset": str(group_offset), "q": query},
            solr_rows,
            resource_uri=rows_uri,
            count=doclist["numFound"],
        ).page()

        dataset_bundle.data.update(rows_page)

        dehydrated_rows = []
        for solr_row in solr_rows:
            row_bundle = self.build_bundle(obj=solr_row, request=request)
            dehydrated_rows.append(self.full_dehydrate(row_bundle))
        dataset_bundle.data["objects"] = dehydrated_rows

        results_by_dataset.append(dataset_bundle.data)

    page["objects"] = results_by_dataset

    self.log_throttled_access(request)

    return self.create_response(request, page)
def search_all_data(self, request, **kwargs):
    """
    List endpoint using Solr. Provides full-text search via the "q"
    parameter. Supports category and "since" filters; results are
    grouped by dataset with nested per-dataset pagination.
    """
    self.method_check(request, allowed=['get'])
    self.is_authenticated(request)
    self.throttle_check(request)

    query = request.GET.get('q', '')
    category = request.GET.get('category', '')
    since = request.GET.get('since', None)
    limit = int(request.GET.get('limit', settings.PANDA_DEFAULT_SEARCH_GROUPS))
    offset = int(request.GET.get('offset', 0))
    group_limit = int(request.GET.get('group_limit', settings.PANDA_DEFAULT_SEARCH_ROWS_PER_GROUP))
    group_offset = int(request.GET.get('group_offset', 0))

    # Restrict the query to datasets in the requested category (or to
    # datasets with no category when category == 'uncategorized').
    if category:
        if category != 'uncategorized':
            category = Category.objects.get(slug=category)
            dataset_slugs = category.datasets.values_list('slug', flat=True)
        else:
            dataset_slugs = Dataset.objects.filter(categories=None).values_list('slug', flat=True)

        query += ' dataset_slug:(%s)' % ' '.join(dataset_slugs)

    # Only match rows modified since the given timestamp.
    # NOTE(review): when "q" is empty this produces "... AND ()" -- verify.
    if since:
        query = 'last_modified:[' + since + 'Z TO *] AND (%s)' % query

    response = solr.query_grouped(
        settings.SOLR_DATA_CORE,
        query,
        'dataset_slug',
        offset=offset,
        limit=limit,
        group_limit=group_limit,
        group_offset=group_offset
    )

    groups = response['grouped']['dataset_slug']['groups']

    # Outer pagination is over dataset groups, not individual rows.
    page = PandaPaginator(
        request.GET,
        groups,
        resource_uri=request.path_info,
        count=response['grouped']['dataset_slug']['ngroups']
    ).page()

    datasets = []

    for group in groups:
        dataset_slug = group['groupValue']
        results = group['doclist']

        dataset_resource = DatasetResource()
        # NOTE(review): unguarded get() -- stale Solr data for a deleted
        # dataset raises Dataset.DoesNotExist and fails the whole request.
        dataset = Dataset.objects.get(slug=dataset_slug)

        dataset_bundle = dataset_resource.build_bundle(obj=dataset, request=request)
        dataset_bundle = dataset_resource.full_dehydrate(dataset_bundle)
        dataset_bundle = dataset_resource.simplify_bundle(dataset_bundle)

        objects = [SolrObject(obj) for obj in results['docs']]

        dataset_search_url = reverse('api_dataset_data_list', kwargs={
            'api_name': self._meta.api_name,
            'dataset_resource_name': 'dataset',
            'resource_name': 'data',
            'dataset_slug': dataset.slug
        })

        # Nested pagination: this dataset's rows are paginated
        # independently with group_limit/group_offset.
        data_page = PandaPaginator(
            {
                'limit': str(group_limit),
                'offset': str(group_offset),
                'q': query
            },
            objects,
            resource_uri=dataset_search_url,
            count=results['numFound']
        ).page()

        dataset_bundle.data.update(data_page)
        dataset_bundle.data['objects'] = []

        for obj in objects:
            data_bundle = self.build_bundle(obj=obj, request=request)
            data_bundle = self.full_dehydrate(data_bundle)
            dataset_bundle.data['objects'].append(data_bundle)

        datasets.append(dataset_bundle.data)

    page['objects'] = datasets

    # Log query
    # NOTE(review): request.user may not be a full User instance when
    # authentication happened via headers -- later revisions fetch a
    # UserProxy here first.
    SearchLog.objects.create(user=request.user, dataset=None, query=query)

    self.log_throttled_access(request)

    return self.create_response(request, page)
def search_all_data(self, request, **kwargs):
    """
    List endpoint using Solr. Provides full-text search via the "q"
    parameter. Filter clauses (category, since) are collected into
    solr_query_bits and ANDed together; an "export" flag queues an
    asynchronous export task instead of returning results inline.
    """
    self.method_check(request, allowed=['get'])
    self.is_authenticated(request)
    self.throttle_check(request)

    # Parenthesize the user's query so it composes safely with the
    # ANDed filter clauses below.
    try:
        query = '(%s)' % request.GET['q']
    except KeyError:
        query = ''

    category = request.GET.get('category', '')
    since = request.GET.get('since', None)
    limit = int(request.GET.get('limit', settings.PANDA_DEFAULT_SEARCH_GROUPS))
    offset = int(request.GET.get('offset', 0))
    group_limit = int(request.GET.get('group_limit', settings.PANDA_DEFAULT_SEARCH_ROWS_PER_GROUP))
    group_offset = int(request.GET.get('group_offset', 0))
    # NOTE(review): bool() on a query-string value means "?export=false"
    # still queues an export -- any non-empty string is truthy.
    export = bool(request.GET.get('export', False))

    solr_query_bits = [query]

    # Restrict the query to datasets in the requested category (or to
    # datasets with no category when category == 'uncategorized').
    if category:
        if category != 'uncategorized':
            category = Category.objects.get(slug=category)
            dataset_slugs = category.datasets.values_list('slug', flat=True)
        else:
            dataset_slugs = Dataset.objects.filter(categories=None).values_list('slug', flat=True)

        solr_query_bits.append('dataset_slug:(%s)' % ' '.join(dataset_slugs))

    # Only match rows modified since the given timestamp.
    if since:
        solr_query_bits.append('last_modified:[' + since + 'Z TO *]')

    # NOTE(review): when "q" is absent the first bit is '' and the
    # ' AND '.join below yields a leading " AND " -- confirm Solr
    # tolerates this, or that callers always send "q".

    # Because users may have authenticated via headers the request.user may
    # not be a full User instance. To be sure, we fetch one.
    user = UserProxy.objects.get(id=request.user.id)

    if export:
        # Queue the export task and skip querying entirely.
        task_type = ExportSearchTask

        task = TaskStatus.objects.create(
            task_name=task_type.name,
            task_description=_('Export search results for "%s".') % query,
            creator=user
        )

        task_type.apply_async(
            args=[query, task.id],
            kwargs={},
            task_id=task.id
        )
    else:
        response = solr.query_grouped(
            settings.SOLR_DATA_CORE,
            ' AND '.join(solr_query_bits),
            'dataset_slug',
            offset=offset,
            limit=limit,
            group_limit=group_limit,
            group_offset=group_offset
        )

        groups = response['grouped']['dataset_slug']['groups']

        # Outer pagination is over dataset groups, not individual rows.
        page = PandaPaginator(
            request.GET,
            groups,
            resource_uri=request.path_info,
            count=response['grouped']['dataset_slug']['ngroups']
        ).page()

        datasets = []

        for group in groups:
            dataset_slug = group['groupValue']
            results = group['doclist']

            try:
                dataset = Dataset.objects.get(slug=dataset_slug)
            # In the event that stale data exists in Solr, skip this dataset,
            # request the invalid data be purged and return the other results.
            # Pagination may be wrong, but this is the most functional solution. (#793)
            except Dataset.DoesNotExist:
                PurgeDataTask.apply_async(args=[dataset_slug])
                solr.delete(settings.SOLR_DATASETS_CORE, 'slug:%s' % dataset_slug)

                page['meta']['total_count'] -= 1

                continue

            dataset_resource = DatasetResource()
            dataset_bundle = dataset_resource.build_bundle(obj=dataset, request=request)
            dataset_bundle = dataset_resource.full_dehydrate(dataset_bundle)
            dataset_bundle = dataset_resource.simplify_bundle(dataset_bundle)

            objects = [SolrObject(obj) for obj in results['docs']]

            dataset_search_url = reverse('api_dataset_data_list', kwargs={
                'api_name': self._meta.api_name,
                'dataset_resource_name': 'dataset',
                'resource_name': 'data',
                'dataset_slug': dataset.slug
            })

            # Nested pagination: this dataset's rows are paginated
            # independently with group_limit/group_offset.
            data_page = PandaPaginator(
                {
                    'limit': str(group_limit),
                    'offset': str(group_offset),
                    'q': query
                },
                objects,
                resource_uri=dataset_search_url,
                count=results['numFound']
            ).page()

            dataset_bundle.data.update(data_page)
            dataset_bundle.data['objects'] = []

            for obj in objects:
                data_bundle = self.build_bundle(obj=obj, request=request)
                data_bundle = self.full_dehydrate(data_bundle)
                dataset_bundle.data['objects'].append(data_bundle)

            datasets.append(dataset_bundle.data)

        page['objects'] = datasets

    # Log query
    SearchLog.objects.create(user=user, dataset=None, query=query)

    self.log_throttled_access(request)

    if export:
        return self.create_response(request, _('Export queued.'))
    else:
        return self.create_response(request, page)