Example #1
    def test_reindex_complex(self):
        upload = utils.get_test_data_upload(self.user, self.dataset, filename=utils.TEST_CSV_TYPES_FILENAME)
        self.dataset.import_data(self.user, upload)

        utils.wait()

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)

        dataset.reindex_data(self.user, typed_columns=[True for c in upload.columns])

        utils.wait()

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)

        self.assertEqual([c['name'] for c in dataset.column_schema], ['text', 'date', 'integer', 'boolean', 'float', 'time', 'datetime', 'empty_column', ''])
        self.assertEqual([c['type'] for c in dataset.column_schema], ['unicode', 'datetime', 'int', 'bool', 'float', 'datetime', 'datetime', 'NoneType', 'unicode'])
        self.assertEqual([c['indexed'] for c in dataset.column_schema], [True for c in upload.columns])
        self.assertEqual([c['indexed_name'] for c in dataset.column_schema], ['column_unicode_text', 'column_datetime_date', 'column_int_integer', 'column_bool_boolean', 'column_float_float', 'column_datetime_time', 'column_datetime_datetime', 'column_NoneType_empty_column', 'column_unicode_'])
        self.assertEqual(dataset.row_count, 5)
        self.assertEqual(dataset.locked, False)

        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_bool_boolean:true')['response']['numFound'], 2)
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_unicode_text:"Chicago Tribune"')['response']['numFound'], 1)
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_datetime_datetime:[1971-01-01T01:01:01Z TO NOW]')['response']['numFound'], 1)
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_datetime_time:[9999-12-31T04:13:01Z TO *]')['response']['numFound'], 2)
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_datetime_date:[1971-01-01T00:00:00Z TO NOW]')['response']['numFound'], 1)
Example #2
    def test_reindex(self):
        self.dataset.import_data(self.user, self.upload)

        utils.wait()

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)

        dataset.reindex_data(self.user, typed_columns=[True, False, True, True])

        utils.wait()

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)

        self.assertEqual([c['name'] for c in dataset.column_schema], ['id', 'first_name', 'last_name', 'employer'])
        self.assertEqual([c['type'] for c in dataset.column_schema], ['int', 'unicode', 'unicode', 'unicode'])
        self.assertEqual([c['indexed'] for c in dataset.column_schema], [True, False, True, True])
        self.assertEqual([c['indexed_name'] for c in dataset.column_schema], ['column_int_id', None, 'column_unicode_last_name', 'column_unicode_employer'])
        self.assertEqual(dataset.row_count, 4)
        self.assertEqual(dataset.locked, False)

        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_int_id:2')['response']['numFound'], 1)
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_unicode_last_name:Germuska')['response']['numFound'], 1)
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_unicode_first_name:Joseph')['response']['numFound'], 0)
Example #3
    def test_import_additional_xlsx_typed_columns(self):
        self.dataset.import_data(self.user, self.upload)

        # Refresh from database
        self.dataset = Dataset.objects.get(id=self.dataset.id)

        self.dataset.reindex_data(self.user, typed_columns=[True, False, True, True])

        second_upload = utils.get_test_data_upload(self.user, self.dataset, utils.TEST_EXCEL_XLSX_FILENAME)
        
        # Refresh from database
        self.dataset = Dataset.objects.get(id=self.dataset.id)

        self.dataset.import_data(self.user, second_upload)

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)
        
        self.assertEqual([c['name'] for c in dataset.column_schema], ['id', 'first_name', 'last_name', 'employer'])
        self.assertEqual([c['type'] for c in dataset.column_schema], ['int', 'unicode', 'unicode', 'unicode'])
        self.assertEqual([c['indexed'] for c in dataset.column_schema], [True, False, True, True])
        self.assertEqual([c['indexed_name'] for c in dataset.column_schema], ['column_int_id', None, 'column_unicode_last_name', 'column_unicode_employer'])
        self.assertEqual(dataset.row_count, 8)
        self.assertEqual(dataset.locked, False)

        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 2)
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_int_id:2')['response']['numFound'], 2)
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_unicode_last_name:Germuska')['response']['numFound'], 2)
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_unicode_first_name:Joseph')['response']['numFound'], 0)
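Example #4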
    def test_delete(self):
        upload = utils.get_test_data_upload(self.user, self.dataset)
        upload_id = upload.id
        path = upload.get_path()

        self.assertEqual(os.path.isfile(path), True)

        solr.delete(settings.SOLR_DATA_CORE, '*:*')
        self.dataset.import_data(self.user, upload)
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 1)

        upload = DataUpload.objects.get(id=upload_id)
        
        dataset = Dataset.objects.get(id=self.dataset.id)
        self.assertEqual(dataset.initial_upload, upload)
        self.assertEqual(dataset.row_count, 4)

        upload.delete()

        # Ensure dataset still exists
        dataset = Dataset.objects.get(id=self.dataset.id)
        self.assertEqual(dataset.initial_upload, None)
        self.assertEqual(dataset.row_count, 0)

        self.assertEqual(os.path.exists(path), False)

        with self.assertRaises(DataUpload.DoesNotExist):
            DataUpload.objects.get(id=upload_id)
        
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 0)
Example #5
    def test_reindex(self):
        self.dataset.import_data(self.user, self.upload)

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)

        dataset.reindex_data(self.user, typed_columns=[True, False, True, True])

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)
        task = dataset.current_task

        self.assertEqual(task.status, 'SUCCESS')
        self.assertNotEqual(task.start, None)
        self.assertNotEqual(task.end, None)
        self.assertEqual(task.traceback, None)

        self.assertEqual([c['name'] for c in dataset.column_schema], ['id', 'first_name', 'last_name', 'employer'])
        self.assertEqual([c['type'] for c in dataset.column_schema], ['int', 'unicode', 'unicode', 'unicode'])
        self.assertEqual([c['indexed'] for c in dataset.column_schema], [True, False, True, True])
        self.assertEqual([c['indexed_name'] for c in dataset.column_schema], ['column_int_id', None, 'column_unicode_last_name', 'column_unicode_employer'])
        self.assertEqual([c['min'] for c in dataset.column_schema], [1, None, None, None])
        self.assertEqual([c['max'] for c in dataset.column_schema], [4, None, None, None])
        self.assertEqual(dataset.row_count, 4)
        self.assertEqual(dataset.locked, False)

        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_int_id:2')['response']['numFound'], 1)
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_unicode_last_name:Germuska')['response']['numFound'], 1)
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_unicode_first_name:Joseph')['response']['numFound'], 0)
Example #6
    def test_reindex_complex(self):
        upload = utils.get_test_data_upload(self.user, self.dataset, filename=utils.TEST_CSV_TYPES_FILENAME)
        self.dataset.import_data(self.user, upload)

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)

        dataset.reindex_data(self.user, typed_columns=[True for c in upload.columns])

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)
        task = dataset.current_task

        self.assertEqual(task.status, 'SUCCESS')
        self.assertNotEqual(task.start, None)
        self.assertNotEqual(task.end, None)
        self.assertEqual(task.traceback, None)

        self.assertEqual([c['name'] for c in dataset.column_schema], ['text', 'date', 'integer', 'boolean', 'float', 'time', 'datetime', 'empty_column', ''])
        self.assertEqual([c['type'] for c in dataset.column_schema], ['unicode', 'date', 'int', 'bool', 'float', 'time', 'datetime', None, 'unicode'])
        self.assertEqual([c['indexed'] for c in dataset.column_schema], [True for c in upload.columns])
        self.assertEqual([c['indexed_name'] for c in dataset.column_schema], ['column_unicode_text', 'column_date_date', 'column_int_integer', 'column_bool_boolean', 'column_float_float', 'column_time_time', 'column_datetime_datetime', None, 'column_unicode_'])
        self.assertEqual([c['min'] for c in dataset.column_schema], [None, u'1920-01-01T00:00:00', 40, None, 1.0, u'9999-12-31T00:00:00', u'1971-01-01T04:14:00', None, None])
        self.assertEqual([c['max'] for c in dataset.column_schema], [None, u'1971-01-01T00:00:00', 164, None, 41800000.01, u'9999-12-31T14:57:13', u'2048-01-01T14:57:00', None, None])
        self.assertEqual(dataset.row_count, 5)
        self.assertEqual(dataset.locked, False)

        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_bool_boolean:true')['response']['numFound'], 2)
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_unicode_text:"Chicago Tribune"')['response']['numFound'], 1)
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_datetime_datetime:[1971-01-01T01:01:01Z TO NOW]')['response']['numFound'], 1)
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_time_time:[9999-12-31T04:13:01Z TO *]')['response']['numFound'], 2)
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_date_date:[1971-01-01T00:00:00Z TO NOW]')['response']['numFound'], 1)
Example #7
    def test_delete(self):
        upload = utils.get_test_data_upload(self.user, self.dataset)
        upload_id = upload.id
        path = upload.get_path()

        self.assertEqual(os.path.isfile(path), True)

        solr.delete(settings.SOLR_DATA_CORE, '*:*')
        self.dataset.import_data(self.user, upload)
        self.assertEqual(
            solr.query(settings.SOLR_DATA_CORE,
                       'Christopher')['response']['numFound'], 1)

        upload = DataUpload.objects.get(id=upload_id)

        dataset = Dataset.objects.get(id=self.dataset.id)
        self.assertEqual(dataset.initial_upload, upload)
        self.assertEqual(dataset.row_count, 4)

        upload.delete()

        # Ensure dataset still exists
        dataset = Dataset.objects.get(id=self.dataset.id)
        self.assertEqual(dataset.initial_upload, None)
        self.assertEqual(dataset.row_count, 0)

        self.assertEqual(os.path.exists(path), False)

        with self.assertRaises(DataUpload.DoesNotExist):
            DataUpload.objects.get(id=upload_id)

        self.assertEqual(
            solr.query(settings.SOLR_DATA_CORE,
                       'Christopher')['response']['numFound'], 0)
Example #8
    def test_import_additional_xlsx_typed_columns(self):
        self.dataset.import_data(self.user, self.upload)

        # Refresh from database
        self.dataset = Dataset.objects.get(id=self.dataset.id)

        self.dataset.reindex_data(self.user,
                                  typed_columns=[True, False, True, True])

        second_upload = utils.get_test_data_upload(
            self.user, self.dataset, utils.TEST_EXCEL_XLSX_FILENAME)

        # Refresh from database
        self.dataset = Dataset.objects.get(id=self.dataset.id)

        self.dataset.import_data(self.user, second_upload)

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)

        self.assertEqual([c['name'] for c in dataset.column_schema],
                         ['id', 'first_name', 'last_name', 'employer'])
        self.assertEqual([c['type'] for c in dataset.column_schema],
                         ['int', 'unicode', 'unicode', 'unicode'])
        self.assertEqual([c['indexed'] for c in dataset.column_schema],
                         [True, False, True, True])
        self.assertEqual([c['indexed_name'] for c in dataset.column_schema], [
            'column_int_id', None, 'column_unicode_last_name',
            'column_unicode_employer'
        ])
        self.assertEqual(dataset.row_count, 8)
        self.assertEqual(dataset.locked, False)

        self.assertEqual(
            solr.query(settings.SOLR_DATA_CORE,
                       'Christopher')['response']['numFound'], 2)
        self.assertEqual(
            solr.query(settings.SOLR_DATA_CORE,
                       'column_int_id:2')['response']['numFound'], 2)
        self.assertEqual(
            solr.query(
                settings.SOLR_DATA_CORE,
                'column_unicode_last_name:Germuska')['response']['numFound'],
            2)
        self.assertEqual(
            solr.query(
                settings.SOLR_DATA_CORE,
                'column_unicode_first_name:Joseph')['response']['numFound'], 0)
Example #9
    def test_import_xls(self):
        xls_upload = utils.get_test_data_upload(self.user, self.dataset, utils.TEST_XLS_FILENAME)

        self.dataset.import_data(self.user, xls_upload)

        task = self.dataset.current_task

        self.assertNotEqual(task, None)
        self.assertNotEqual(task.id, None)
        self.assertEqual(task.task_name, 'panda.tasks.import.xls')

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)
        xls_upload = DataUpload.objects.get(id=xls_upload.id)
        task = TaskStatus.objects.get(id=task.id)

        self.assertEqual([c['name'] for c in dataset.column_schema], ['id', 'first_name', 'last_name', 'employer'])
        self.assertEqual([c['type'] for c in dataset.column_schema], ['int', 'unicode', 'unicode', 'unicode'])
        self.assertEqual([c['indexed_name'] for c in dataset.column_schema], [None, None, None, None])
        self.assertEqual(dataset.row_count, 4)
        self.assertEqual(xls_upload.imported, True)
        self.assertEqual(task.status, 'SUCCESS')
        self.assertNotEqual(task.start, None)
        self.assertNotEqual(task.end, None)
        self.assertEqual(task.traceback, None)
        self.assertEqual(dataset.locked, False)

        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 1)
Example #10
    def test_import_additional_data_same_columns(self):
        self.dataset.import_data(self.user, self.upload)

        xls_upload = utils.get_test_data_upload(self.user, self.dataset,
                                                utils.TEST_XLS_FILENAME)

        # Refresh from database
        self.dataset = Dataset.objects.get(id=self.dataset.id)

        self.dataset.import_data(self.user, xls_upload)

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)
        upload = DataUpload.objects.get(id=self.upload.id)
        xls_upload = DataUpload.objects.get(id=xls_upload.id)

        self.assertEqual([c['name'] for c in dataset.column_schema],
                         ['id', 'first_name', 'last_name', 'employer'])
        self.assertEqual([c['type'] for c in dataset.column_schema],
                         ['int', 'unicode', 'unicode', 'unicode'])
        self.assertEqual([c['indexed_name'] for c in dataset.column_schema],
                         [None, None, None, None])
        self.assertEqual(dataset.row_count, 8)
        self.assertEqual(upload.imported, True)
        self.assertEqual(xls_upload.imported, True)
        self.assertEqual(dataset.locked, False)

        self.assertEqual(
            solr.query(settings.SOLR_DATA_CORE,
                       'Christopher')['response']['numFound'], 2)
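Example #11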
    def run(self, *args, **kwargs):
        from panda.models import SearchSubscription

        log = logging.getLogger(self.name)
        log.info('Running subscribed searches')

        subscriptions = SearchSubscription.objects.all()

        for sub in subscriptions:
            log.info('Running subscription: %s' % sub)

            since = sub.last_run.replace(microsecond=0, tzinfo=None)
            since = since.isoformat('T')

            sub.last_run = now()
            sub.save()
   
            solr_query = 'last_modified:[%s TO *] AND (%s)' % (since + 'Z', sub.query)

            if sub.dataset:
                solr_query += ' dataset_slug:%s' % (sub.dataset.slug)
            elif sub.category:
                dataset_slugs = sub.category.datasets.values_list('slug', flat=True)
                solr_query += ' dataset_slug:(%s)' % ' '.join(dataset_slugs)

            response = solr.query(
                settings.SOLR_DATA_CORE,
                solr_query,
                offset=0,
                limit=0
            )

            count = response['response']['numFound'] 

            log.info('Found %i new results' % count)

            if count:
                if sub.dataset:
                    url = '#dataset/%s/search/%s/%s' % (sub.dataset.slug, sub.query_url, since)
                elif sub.category:
                    url = '#search/%s/%s/%s' % (sub.category.slug, sub.query, since)
                else:
                    url = '#search/all/%s/%s' % (sub.query, since)
                    
                notify(
                    sub.user,
                    'subscription_results',
                    'info',
                    url=url,
                    extra_context={
                        'query': sub.query,
                        'query_url': sub.query_url,
                        'category': sub.category,
                        'related_dataset': sub.dataset,
                        'count': count,
                        'since': since
                    }
                )

        log.info('Finished running subscribed searches')
Example #12
    def test_import_excel_xlsx(self):
        xlsx_upload = utils.get_test_data_upload(self.user, self.dataset, utils.TEST_EXCEL_XLSX_FILENAME)

        self.dataset.import_data(self.user, xlsx_upload)

        task = self.dataset.current_task

        self.assertNotEqual(task, None)
        self.assertNotEqual(task.id, None)
        self.assertEqual(task.task_name, 'panda.tasks.import.xlsx')

        utils.wait()

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)
        xlsx_upload = DataUpload.objects.get(id=xlsx_upload.id)
        task = TaskStatus.objects.get(id=task.id)

        self.assertEqual(dataset.columns, ['id', 'first_name', 'last_name', 'employer'])
        self.assertEqual(dataset.row_count, 4)
        self.assertEqual(xlsx_upload.imported, True)
        self.assertEqual(task.status, 'SUCCESS')
        self.assertNotEqual(task.start, None)
        self.assertNotEqual(task.end, None)
        self.assertEqual(task.traceback, None)

        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 1)
Example #13
    def run(self, *args, **kwargs):
        from panda.models import SearchSubscription

        log = logging.getLogger(self.name)
        log.info('Running subscribed searches')

        subscriptions = SearchSubscription.objects.all()

        for sub in subscriptions:
            log.info('Running subscription: %s' % sub)

            since = sub.last_run.replace(microsecond=0, tzinfo=None)
            since = since.isoformat('T')

            sub.last_run = now()
            sub.save()

            solr_query = 'last_modified:[%s TO *] AND (%s)' % (since + 'Z',
                                                               sub.query)

            if sub.dataset:
                solr_query += ' dataset_slug:%s' % (sub.dataset.slug)
            elif sub.category:
                dataset_slugs = sub.category.datasets.values_list('slug',
                                                                  flat=True)
                solr_query += ' dataset_slug:(%s)' % ' '.join(dataset_slugs)

            response = solr.query(settings.SOLR_DATA_CORE,
                                  solr_query,
                                  offset=0,
                                  limit=0)

            count = response['response']['numFound']

            log.info('Found %i new results' % count)

            if count:
                if sub.dataset:
                    url = '#dataset/%s/search/%s/%s' % (sub.dataset.slug,
                                                        sub.query_url, since)
                elif sub.category:
                    url = '#search/%s/%s/%s' % (sub.category.slug, sub.query,
                                                since)
                else:
                    url = '#search/all/%s/%s' % (sub.query, since)

                notify(sub.user,
                       'subscription_results',
                       'info',
                       url=url,
                       extra_context={
                           'query': sub.query,
                           'query_url': sub.query_url,
                           'category': sub.category,
                           'related_dataset': sub.dataset,
                           'count': count,
                           'since': since
                       })

        log.info('Finished running subscribed searches')
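To make the query construction above concrete, here is a minimal standalone sketch (the subscription values are hypothetical, not taken from the source) of the Solr query string this loop builds for a dataset-scoped subscription:

    # Hypothetical values standing in for sub.last_run, sub.query and sub.dataset.slug
    since = '2012-01-01T00:00:00'
    query = 'Chicago'
    dataset_slug = 'crime-reports'

    solr_query = 'last_modified:[%s TO *] AND (%s)' % (since + 'Z', query)
    solr_query += ' dataset_slug:%s' % dataset_slug

    print(solr_query)
    # last_modified:[2012-01-01T00:00:00Z TO *] AND (Chicago) dataset_slug:crime-reports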
Example #14
    def test_import_data(self):
        response = self.client.get(
            '/api/1.0/dataset/%s/import/%i/' %
            (self.dataset.slug, self.upload.id), **self.auth_headers)

        self.assertEqual(response.status_code, 200)

        body = json.loads(response.content)

        self.assertNotEqual(body['current_task'], None)
        self.assertEqual(body['current_task']['task_name'],
                         'panda.tasks.import.csv')

        # Refetch dataset so that attributes will be updated
        self.dataset = Dataset.objects.get(id=self.dataset.id)

        self.assertEqual(self.dataset.row_count, 4)
        self.assertEqual([c['name'] for c in self.dataset.column_schema],
                         self.upload.columns)
        self.assertEqual(self.dataset.initial_upload, self.upload)
        self.assertEqual(self.dataset.sample_data, self.upload.sample_data)

        task = self.dataset.current_task

        self.assertNotEqual(task, None)
        self.assertEqual(task.status, 'SUCCESS')
        self.assertEqual(task.task_name, 'panda.tasks.import.csv')
        self.assertNotEqual(task.start, None)
        self.assertNotEqual(task.end, None)
        self.assertEqual(task.traceback, None)

        self.assertEqual(
            solr.query(settings.SOLR_DATA_CORE,
                       'Christopher')['response']['numFound'], 1)
Example #15
    def _count_rows(self):
        """
        Count the number of rows currently stored in Solr for this Dataset.
        Useful for sanity checks.
        """
        return solr.query(settings.SOLR_DATA_CORE, 'dataset_slug:%s' %
                          self.slug)['response']['numFound']
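For reference, here is a minimal, hypothetical sketch of the dictionary shape these snippets index into, truncated to the keys used in these examples (not captured from a real PANDA instance):

    # Hypothetical, truncated solr.query response; only the keys used above are shown.
    response = {
        'response': {
            'numFound': 4,   # total matching rows; what _count_rows returns
            'docs': [        # one dict per matching Solr document
                {'dataset_slug': 'contributors', 'external_id': '1'},
            ],
        },
    }

    assert response['response']['numFound'] == 4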
Example #16
    def test_import_data(self):
        response = self.client.get('/api/1.0/dataset/%s/import/%i/' % (self.dataset.slug, self.upload.id), **self.auth_headers)

        utils.wait() 

        self.assertEqual(response.status_code, 200)

        body = json.loads(response.content)

        self.assertNotEqual(body['current_task'], None)
        self.assertEqual(body['current_task']['task_name'], 'panda.tasks.import.csv')
        
        # Refetch dataset so that attributes will be updated
        self.dataset = Dataset.objects.get(id=self.dataset.id)

        self.assertEqual(self.dataset.row_count, 4)
        self.assertEqual([c['name'] for c in self.dataset.column_schema], self.upload.columns)
        self.assertEqual(self.dataset.initial_upload, self.upload)
        self.assertEqual(self.dataset.sample_data, self.upload.sample_data)

        task = self.dataset.current_task

        self.assertNotEqual(task, None)
        self.assertEqual(task.status, 'SUCCESS')
        self.assertEqual(task.task_name, 'panda.tasks.import.csv')
        self.assertNotEqual(task.start, None)
        self.assertNotEqual(task.end, None)
        self.assertEqual(task.traceback, None)

        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 1)
Example #17
    def test_reindex_data(self):
        response = self.client.get('/api/1.0/dataset/%s/import/%i/' % (self.dataset.slug, self.upload.id), **self.auth_headers)

        utils.wait() 

        response = self.client.get('/api/1.0/dataset/%s/reindex/?typed_columns=True,False,False,False' % (self.dataset.slug), **self.auth_headers)

        utils.wait() 

        self.assertEqual(response.status_code, 200)
        
        # Refetch dataset so that attributes will be updated
        self.dataset = Dataset.objects.get(id=self.dataset.id)

        self.assertEqual(self.dataset.row_count, 4)
        self.assertEqual([c['name'] for c in self.dataset.column_schema], self.upload.columns)
        self.assertEqual(self.dataset.initial_upload, self.upload)
        self.assertEqual(self.dataset.sample_data, self.upload.sample_data)

        task = self.dataset.current_task

        self.assertNotEqual(task, None)
        self.assertEqual(task.status, 'SUCCESS')
        self.assertEqual(task.task_name, 'panda.tasks.reindex')
        self.assertNotEqual(task.start, None)
        self.assertNotEqual(task.end, None)
        self.assertEqual(task.traceback, None)

        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_int_id:3')['response']['numFound'], 1)
Example #18
    def test_import_csv_with_schema_overrides(self):

        overrides = {
            'id': {'indexed': True, 'type': 'float'},
            'last_name': {'indexed': True},
        }
        self.dataset.import_data(self.user, self.upload, schema_overrides=overrides)

        task = self.dataset.current_task

        self.assertNotEqual(task, None)
        self.assertNotEqual(task.id, None)
        self.assertEqual(task.task_name, 'panda.tasks.import.csv')

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)
        upload = DataUpload.objects.get(id=self.upload.id)
        task = TaskStatus.objects.get(id=task.id)

        self.assertEqual([c['name'] for c in dataset.column_schema], ['id', 'first_name', 'last_name', 'employer'])
        self.assertEqual(upload.guessed_types, ['int', 'unicode', 'unicode', 'unicode'])
        # NOTE: Without overrides, the "id" column's type would be "int" (per guessed_types) and all indexed_names would be None
        self.assertEqual([c['type'] for c in dataset.column_schema], ['float', 'unicode', 'unicode', 'unicode'])
        self.assertEqual([c['indexed_name'] for c in dataset.column_schema], ['column_float_id', None, 'column_unicode_last_name', None])
        self.assertEqual(dataset.row_count, 4)
        self.assertEqual(upload.imported, True)
        self.assertEqual(task.status, 'SUCCESS')
        self.assertNotEqual(task.start, None)
        self.assertNotEqual(task.end, None)
        self.assertEqual(task.traceback, None)
        self.assertEqual(dataset.locked, False)

        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 1)
Example #19
    def test_delete(self):
        self.dataset.import_data(self.user, self.upload)

        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 1)

        dataset_id = self.dataset.id
        self.dataset.delete()

        with self.assertRaises(Dataset.DoesNotExist):
            Dataset.objects.get(id=dataset_id)

        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 0)

        response = solr.query(settings.SOLR_DATASETS_CORE, 'contributors', sort='slug asc')

        self.assertEqual(response['response']['numFound'], 0)
Example #20
    def test_import_xls(self):
        xls_upload = utils.get_test_data_upload(self.user, self.dataset,
                                                utils.TEST_XLS_FILENAME)

        self.dataset.import_data(self.user, xls_upload)

        task = self.dataset.current_task

        self.assertNotEqual(task, None)
        self.assertNotEqual(task.id, None)
        self.assertEqual(task.task_name, 'panda.tasks.import.xls')

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)
        xls_upload = DataUpload.objects.get(id=xls_upload.id)
        task = TaskStatus.objects.get(id=task.id)

        self.assertEqual([c['name'] for c in dataset.column_schema],
                         ['id', 'first_name', 'last_name', 'employer'])
        self.assertEqual([c['type'] for c in dataset.column_schema],
                         ['int', 'unicode', 'unicode', 'unicode'])
        self.assertEqual([c['indexed_name'] for c in dataset.column_schema],
                         [None, None, None, None])
        self.assertEqual(dataset.row_count, 4)
        self.assertEqual(xls_upload.imported, True)
        self.assertEqual(task.status, 'SUCCESS')
        self.assertNotEqual(task.start, None)
        self.assertNotEqual(task.end, None)
        self.assertEqual(task.traceback, None)
        self.assertEqual(dataset.locked, False)

        self.assertEqual(
            solr.query(settings.SOLR_DATA_CORE,
                       'Christopher')['response']['numFound'], 1)
Example #21
    def test_reindex_data(self):
        response = self.client.get(
            '/api/1.0/dataset/%s/import/%i/' %
            (self.dataset.slug, self.upload.id), **self.auth_headers)

        response = self.client.get(
            '/api/1.0/dataset/%s/reindex/?typed_columns=True,False,False,False'
            % (self.dataset.slug), **self.auth_headers)

        self.assertEqual(response.status_code, 200)

        # Refetch dataset so that attributes will be updated
        self.dataset = Dataset.objects.get(id=self.dataset.id)

        self.assertEqual(self.dataset.row_count, 4)
        self.assertEqual([c['name'] for c in self.dataset.column_schema],
                         self.upload.columns)
        self.assertEqual(self.dataset.initial_upload, self.upload)
        self.assertEqual(self.dataset.sample_data, self.upload.sample_data)

        task = self.dataset.current_task

        self.assertNotEqual(task, None)
        self.assertEqual(task.status, 'SUCCESS')
        self.assertEqual(task.task_name, 'panda.tasks.reindex')
        self.assertNotEqual(task.start, None)
        self.assertNotEqual(task.end, None)
        self.assertEqual(task.traceback, None)

        self.assertEqual(
            solr.query(settings.SOLR_DATA_CORE,
                       'column_int_id:3')['response']['numFound'], 1)
Example #22
    def test_import_additional_data_different_columns(self):
        self.dataset.import_data(self.user, self.upload)

        xls_upload = utils.get_test_data_upload(self.user, self.dataset,
                                                utils.TEST_XLS_FILENAME)
        xls_upload.columns = [
            'id', 'first_name', 'last_name', 'employer', 'MORE COLUMNS!'
        ]
        xls_upload.save()

        # Refresh from database
        self.dataset = Dataset.objects.get(id=self.dataset.id)

        self.assertRaises(DataImportError, self.dataset.import_data, self.user,
                          xls_upload)

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)
        upload = DataUpload.objects.get(id=self.upload.id)
        xls_upload = DataUpload.objects.get(id=xls_upload.id)

        self.assertEqual([c['name'] for c in dataset.column_schema],
                         ['id', 'first_name', 'last_name', 'employer'])
        self.assertEqual(dataset.row_count, 4)
        self.assertEqual(upload.imported, True)
        self.assertEqual(xls_upload.imported, False)
        self.assertEqual(dataset.locked, False)

        self.assertEqual(
            solr.query(settings.SOLR_DATA_CORE,
                       'Christopher')['response']['numFound'], 1)
Example #23
    def test_import_additional_data_different_columns(self):
        self.dataset.import_data(self.user, self.upload)

        utils.wait()

        xls_upload = utils.get_test_data_upload(self.user, self.dataset, utils.TEST_XLS_FILENAME)
        xls_upload.columns = ['id', 'first_name', 'last_name', 'employer', 'MORE COLUMNS!']
        xls_upload.save()
        
        # Refresh from database
        self.dataset = Dataset.objects.get(id=self.dataset.id)

        self.assertRaises(DataImportError, self.dataset.import_data, self.user, xls_upload)

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)
        upload = DataUpload.objects.get(id=self.upload.id)
        xls_upload = DataUpload.objects.get(id=xls_upload.id)
        
        self.assertEqual(dataset.columns, ['id', 'first_name', 'last_name', 'employer'])
        self.assertEqual(dataset.row_count, 4)
        self.assertEqual(upload.imported, True)
        self.assertEqual(xls_upload.imported, False)

        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 1)
Example #24
    def test_import_additional_data_same_columns(self):
        self.dataset.import_data(self.user, self.upload)

        utils.wait()

        xls_upload = utils.get_test_data_upload(self.user, self.dataset, utils.TEST_XLS_FILENAME)
        
        # Refresh from database
        self.dataset = Dataset.objects.get(id=self.dataset.id)

        self.dataset.import_data(self.user, xls_upload)

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)
        upload = DataUpload.objects.get(id=self.upload.id)
        xls_upload = DataUpload.objects.get(id=xls_upload.id)
        
        self.assertEqual([c['name'] for c in dataset.column_schema], ['id', 'first_name', 'last_name', 'employer'])
        self.assertEqual([c['type'] for c in dataset.column_schema], ['int', 'unicode', 'unicode', 'unicode'])
        self.assertEqual([c['indexed_name'] for c in dataset.column_schema], [None, None, None, None])
        self.assertEqual(dataset.row_count, 8)
        self.assertEqual(upload.imported, True)
        self.assertEqual(xls_upload.imported, True)
        self.assertEqual(dataset.locked, False)

        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'Christopher')['response']['numFound'], 2)
Example #25
    def test_reindex(self):
        self.dataset.import_data(self.user, self.upload)

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)

        dataset.reindex_data(self.user,
                             typed_columns=[True, False, True, True])

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)
        task = dataset.current_task

        self.assertEqual(task.status, 'SUCCESS')
        self.assertNotEqual(task.start, None)
        self.assertNotEqual(task.end, None)
        self.assertEqual(task.traceback, None)

        self.assertEqual([c['name'] for c in dataset.column_schema],
                         ['id', 'first_name', 'last_name', 'employer'])
        self.assertEqual([c['type'] for c in dataset.column_schema],
                         ['int', 'unicode', 'unicode', 'unicode'])
        self.assertEqual([c['indexed'] for c in dataset.column_schema],
                         [True, False, True, True])
        self.assertEqual([c['indexed_name'] for c in dataset.column_schema], [
            'column_int_id', None, 'column_unicode_last_name',
            'column_unicode_employer'
        ])
        self.assertEqual([c['min'] for c in dataset.column_schema],
                         [1, None, None, None])
        self.assertEqual([c['max'] for c in dataset.column_schema],
                         [4, None, None, None])
        self.assertEqual(dataset.row_count, 4)
        self.assertEqual(dataset.locked, False)

        self.assertEqual(
            solr.query(settings.SOLR_DATA_CORE,
                       'column_int_id:2')['response']['numFound'], 1)
        self.assertEqual(
            solr.query(
                settings.SOLR_DATA_CORE,
                'column_unicode_last_name:Germuska')['response']['numFound'],
            1)
        self.assertEqual(
            solr.query(
                settings.SOLR_DATA_CORE,
                'column_unicode_first_name:Joseph')['response']['numFound'], 0)
Example #26
    def test_change_user_reindex(self):
        solr.delete(settings.SOLR_DATASETS_CORE, '*:*') 

        self.user.first_name = 'bazbarfoo'
        self.user.save()

        dataset = utils.get_test_dataset(self.user)
        upload = utils.get_test_data_upload(self.user, dataset)
        
        self.assertEqual(solr.query(settings.SOLR_DATASETS_CORE, dataset.creator.first_name)['response']['numFound'], 1)
        old_name = dataset.creator.first_name

        dataset.creator.first_name = 'foobarbaz'
        dataset.creator.save()

        self.assertEqual(solr.query(settings.SOLR_DATASETS_CORE, old_name)['response']['numFound'], 0)
        self.assertEqual(solr.query(settings.SOLR_DATASETS_CORE, dataset.creator.first_name)['response']['numFound'], 1)
Example #28
    def search_dataset_data(self, request, **kwargs):
        """
        Perform a full-text search on only one dataset.

        See ``get_list``.
        """
        dataset = Dataset.objects.get(slug=kwargs['dataset_slug'])

        query = request.GET.get('q', '')
        since = request.GET.get('since', None)
        limit = int(
            request.GET.get('limit', settings.PANDA_DEFAULT_SEARCH_ROWS))
        offset = int(request.GET.get('offset', 0))
        sort = request.GET.get('sort', '_docid_ asc')

        if query:
            solr_query = 'dataset_slug:%s AND (%s)' % (dataset.slug, query)
        else:
            solr_query = 'dataset_slug:%s' % dataset.slug

        if since:
            solr_query += ' AND last_modified:[' + since + 'Z TO *]'

        response = solr.query(settings.SOLR_DATA_CORE,
                              solr_query,
                              offset=offset,
                              sort=sort,
                              limit=limit)

        dataset_resource = DatasetResource()
        dataset_bundle = dataset_resource.build_bundle(obj=dataset,
                                                       request=request)
        dataset_bundle = dataset_resource.full_dehydrate(dataset_bundle)
        dataset_bundle = dataset_resource.simplify_bundle(dataset_bundle)

        results = [SolrObject(d) for d in response['response']['docs']]

        page = PandaPaginator(request.GET,
                              results,
                              resource_uri=request.path_info,
                              count=response['response']['numFound']).page()

        dataset_bundle.data.update(page)
        dataset_bundle.data['objects'] = []

        for obj in results:
            bundle = self.build_bundle(obj=obj, request=request)
            bundle = self.full_dehydrate(bundle)
            dataset_bundle.data['objects'].append(bundle.data)

        # Because users may have authenticated via headers, the request.user may
        # not be a full User instance. To be sure, we fetch one.
        user = UserProxy.objects.get(id=request.user.id)

        SearchLog.objects.create(user=user, dataset=dataset, query=query)

        return dataset_bundle
Example #29
    def get_row(self, external_id):
        """
        Fetch a row from this dataset.
        """
        response = solr.query(settings.SOLR_DATA_CORE, 'dataset_slug:%s AND external_id:%s' % (self.slug, external_id), limit=1)

        if len(response['response']['docs']) < 1:
            return None

        return response['response']['docs'][0]
Example #30
    def get_list(self, request, **kwargs):
        """
        List endpoint using Solr. Provides full-text search via the "q" parameter.
        """
        limit = int(request.GET.get('limit', settings.PANDA_DEFAULT_SEARCH_ROWS))
        offset = int(request.GET.get('offset', 0))
        category_slug = request.GET.get('category', None)
        creator_email = request.GET.get('creator_email', None)
        query = request.GET.get('q', '')
        simple = True if request.GET.get('simple', 'false').lower() == 'true' else False

        if category_slug == settings.PANDA_UNCATEGORIZED_SLUG:
            category_id = settings.PANDA_UNCATEGORIZED_ID
        elif category_slug:
            category_id = Category.objects.get(slug=category_slug).id
        else:
            category_id = None

        if category_id is not None and query:
            q = 'categories:%s %s' % (category_id, query)
        elif category_id is not None:
            q = 'categories:%s' % category_id
        else:
            q = query

        if creator_email:
            datasets = Dataset.objects.filter(creator__email=creator_email)
            count = datasets.count()
            datasets = datasets[offset:offset + limit]
        else:
            response = solr.query(settings.SOLR_DATASETS_CORE, q, offset=offset, limit=limit, sort='creation_date desc')
            count = response['response']['numFound']
            
            dataset_slugs = [d['slug'] for d in response['response']['docs']]
            datasets = Dataset.objects.filter(slug__in=dataset_slugs)

        paginator = PandaPaginator(request.GET, datasets, resource_uri=request.path_info, count=count)
        page = paginator.page()

        objects = []

        for obj in datasets:
            bundle = self.build_bundle(obj=obj, request=request)
            bundle = self.full_dehydrate(bundle)

            # Prune attributes we don't care about
            if simple:
                bundle = self.simplify_bundle(bundle)

            objects.append(bundle)

        page['objects'] = objects

        return self.create_response(request, page)
Example #31
    def search_dataset_data(self, request, **kwargs):
        """
        Perform a full-text search on only one dataset.

        See ``get_list``.
        """
        dataset = Dataset.objects.get(slug=kwargs['dataset_slug'])

        query = request.GET.get('q', '')
        since = request.GET.get('since', None)
        limit = int(request.GET.get('limit', settings.PANDA_DEFAULT_SEARCH_ROWS))
        offset = int(request.GET.get('offset', 0))

        if query:
            solr_query = 'dataset_slug:%s AND (%s)' % (dataset.slug, query)
        else:
            solr_query = 'dataset_slug:%s' % dataset.slug

        if since:
            solr_query += ' AND last_modified:[' + since + 'Z TO *]'

        response = solr.query(
            settings.SOLR_DATA_CORE,
            solr_query,
            offset=offset,
            limit=limit
        )

        dataset_resource = DatasetResource()
        dataset_bundle = dataset_resource.build_bundle(obj=dataset, request=request)
        dataset_bundle = dataset_resource.full_dehydrate(dataset_bundle)
        dataset_bundle = dataset_resource.simplify_bundle(dataset_bundle)
       
        results = [SolrObject(d) for d in response['response']['docs']]

        page = PandaPaginator(
            request.GET,
            results,
            resource_uri=request.path_info,
            count=response['response']['numFound']
        ).page() 
        
        dataset_bundle.data.update(page)
        dataset_bundle.data['objects'] = []

        for obj in results:
            bundle = self.build_bundle(obj=obj, request=request)
            bundle = self.full_dehydrate(bundle)
            dataset_bundle.data['objects'].append(bundle.data)
        
        SearchLog.objects.create(user=request.user, dataset=dataset, query=query)

        return dataset_bundle
Example #32
    def get_row(self, external_id):
        """
        Fetch a row from this dataset.
        """
        response = solr.query(settings.SOLR_DATA_CORE,
                              'dataset_slug:%s AND external_id:%s' %
                              (self.slug, external_id),
                              limit=1)

        if len(response['response']['docs']) < 1:
            return None

        return response['response']['docs'][0]
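A small sketch (hypothetical slug and external id, not from the source) of the query string get_row issues for a single-row lookup:

    # Hypothetical values; shows the lookup query get_row sends to the data core.
    slug, external_id = 'contributors', '5'
    print('dataset_slug:%s AND external_id:%s' % (slug, external_id))
    # dataset_slug:contributors AND external_id:5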
Example #33
    def test_reindex_with_currency(self):
        upload = utils.get_test_data_upload(self.user,
                                            self.dataset,
                                            filename=utils.TEST_MONEY)
        self.dataset.import_data(self.user, upload)

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)

        dataset.reindex_data(self.user,
                             typed_columns=[False, True],
                             column_types=['unicode', 'float'])

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)

        self.assertEqual([c['name'] for c in dataset.column_schema],
                         ['product', 'price'])
        self.assertEqual([c['type'] for c in dataset.column_schema],
                         ['unicode', 'float'])
        self.assertEqual([c['indexed'] for c in dataset.column_schema],
                         [False, True])
        self.assertEqual([c['indexed_name'] for c in dataset.column_schema],
                         [None, 'column_float_price'])
        self.assertEqual([c['min'] for c in dataset.column_schema],
                         [None, 39.99])
        self.assertEqual([c['max'] for c in dataset.column_schema],
                         [None, 2599.00])

        self.assertEqual(
            solr.query(settings.SOLR_DATA_CORE,
                       'column_float_price:39.99')['response']['numFound'], 2)
        self.assertEqual(
            solr.query(
                settings.SOLR_DATA_CORE,
                'column_float_price:[1500 TO *]')['response']['numFound'], 2)
        self.assertEqual(
            solr.query(settings.SOLR_DATA_CORE,
                       'column_float_price:*')['response']['numFound'], 8)
Example #34
    def test_delete(self):
        self.dataset.import_data(self.user, self.upload)

        self.assertEqual(
            solr.query(settings.SOLR_DATA_CORE,
                       'Christopher')['response']['numFound'], 1)

        dataset_id = self.dataset.id
        self.dataset.delete()

        with self.assertRaises(Dataset.DoesNotExist):
            Dataset.objects.get(id=dataset_id)

        self.assertEqual(
            solr.query(settings.SOLR_DATA_CORE,
                       'Christopher')['response']['numFound'], 0)

        response = solr.query(settings.SOLR_DATASETS_CORE,
                              'contributors',
                              sort='slug asc')

        self.assertEqual(response['response']['numFound'], 0)
Example #35
    def test_reindex_with_currency(self):
        upload = utils.get_test_data_upload(self.user, self.dataset, filename=utils.TEST_MONEY)
        self.dataset.import_data(self.user, upload)

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)

        dataset.reindex_data(self.user, typed_columns=[False, True], column_types=['unicode', 'float'])

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)

        self.assertEqual([c['name'] for c in dataset.column_schema], ['product', 'price'])
        self.assertEqual([c['type'] for c in dataset.column_schema], ['unicode', 'float'])
        self.assertEqual([c['indexed'] for c in dataset.column_schema], [False, True])
        self.assertEqual([c['indexed_name'] for c in dataset.column_schema], [None, 'column_float_price'])
        self.assertEqual([c['min'] for c in dataset.column_schema], [None, 39.99])
        self.assertEqual([c['max'] for c in dataset.column_schema], [None, 2599.00])

        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_float_price:39.99')['response']['numFound'], 2)
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_float_price:[1500 TO *]')['response']['numFound'], 2)
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_float_price:*')['response']['numFound'], 8)
Example #36
    def search_dataset_data(self, request, **kwargs):
        """
        Perform a full-text search on only one dataset.

        See ``get_list``.
        """
        dataset = Dataset.objects.get(slug=kwargs["dataset_slug"])

        query = request.GET.get("q", "")
        since = request.GET.get("since", None)
        limit = int(request.GET.get("limit", settings.PANDA_DEFAULT_SEARCH_ROWS))
        offset = int(request.GET.get("offset", 0))
        sort = request.GET.get("sort", "_docid_ asc")

        if query:
            solr_query = "dataset_slug:%s AND (%s)" % (dataset.slug, query)
        else:
            solr_query = "dataset_slug:%s" % dataset.slug

        if since:
            solr_query += " AND last_modified:[" + since + "Z TO *]"

        response = solr.query(settings.SOLR_DATA_CORE, solr_query, offset=offset, sort=sort, limit=limit)

        dataset_resource = DatasetResource()
        dataset_bundle = dataset_resource.build_bundle(obj=dataset, request=request)
        dataset_bundle = dataset_resource.full_dehydrate(dataset_bundle)
        dataset_bundle = dataset_resource.simplify_bundle(dataset_bundle)

        results = [SolrObject(d) for d in response["response"]["docs"]]

        page = PandaPaginator(
            request.GET, results, resource_uri=request.path_info, count=response["response"]["numFound"]
        ).page()

        dataset_bundle.data.update(page)
        dataset_bundle.data["objects"] = []

        for obj in results:
            bundle = self.build_bundle(obj=obj, request=request)
            bundle = self.full_dehydrate(bundle)
            dataset_bundle.data["objects"].append(bundle.data)

        # Because users may have authenticated via headers, the request.user may
        # not be a full User instance. To be sure, we fetch one.
        user = UserProxy.objects.get(id=request.user.id)

        SearchLog.objects.create(user=user, dataset=dataset, query=query)

        return dataset_bundle
Example #37
    def test_add_row_typed(self):
        self.dataset.import_data(self.user, self.upload, 0)

        self.dataset.reindex_data(self.user, typed_columns=[True, False, True, True])

        # Refresh from database
        self.dataset = Dataset.objects.get(id=self.dataset.id)

        new_row = ['5', 'Somebody', 'Else', 'Somewhere']

        self.dataset.add_row(self.user, new_row, external_id='5')
        row = self.dataset.get_row('5')

        self.assertEqual(row['external_id'], '5')
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_int_id:5')['response']['numFound'], 1)
Example #38
    def test_search_stale_dataset(self):
        self.dataset.import_data(self.user, self.upload, 0)
        self.dataset.update_full_text()

        # Import second dataset so we can make sure both match 
        second_dataset = Dataset.objects.create(
            name='Second dataset',
            creator=self.dataset.creator)

        second_dataset.import_data(self.user, self.upload, 0)
        second_dataset.update_full_text()

        # Manually delete second dataset to simulate an integrity issue
        from django.db import connection, transaction
        cursor = connection.cursor()

        cursor.execute("DELETE FROM panda_dataset WHERE slug='%s'" % second_dataset.slug)
        transaction.commit_unless_managed()

        # Verify Solr data still exists
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'dataset_slug:%s' % self.dataset.slug)['response']['numFound'], 4)
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'dataset_slug:%s' % second_dataset.slug)['response']['numFound'], 4)
        self.assertEqual(solr.query(settings.SOLR_DATASETS_CORE, 'slug:%s' % self.dataset.slug)['response']['numFound'], 1)
        self.assertEqual(solr.query(settings.SOLR_DATASETS_CORE, 'slug:%s' % second_dataset.slug)['response']['numFound'], 1)

        # Execute search, which should invoke purge as a side-effect
        response = self.client.get('/api/1.0/data/?q=Christopher', **self.auth_headers)
        self.assertEqual(response.status_code, 200)

        # Verify Solr data has been purged
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'dataset_slug:%s' % self.dataset.slug)['response']['numFound'], 4)
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'dataset_slug:%s' % second_dataset.slug)['response']['numFound'], 0)
        self.assertEqual(solr.query(settings.SOLR_DATASETS_CORE, 'slug:%s' % self.dataset.slug)['response']['numFound'], 1)
        self.assertEqual(solr.query(settings.SOLR_DATASETS_CORE, 'slug:%s' % second_dataset.slug)['response']['numFound'], 0)

        body = json.loads(response.content)

        # Verify that the group count is correct
        self.assertEqual(body['meta']['total_count'], 1)
        self.assertEqual(len(body['objects']), 1)

        # Verify that each matched dataset includes one result
        result_dataset = body['objects'][0]
        self.assertEqual(result_dataset['slug'], self.dataset.slug)
        self.assertEqual(result_dataset['meta']['total_count'], 1)
        self.assertEqual(len(result_dataset['objects']), 1)
Example #39
    def test_add_many_rows_typed(self):
        self.dataset.import_data(self.user, self.upload, 0)

        self.dataset.reindex_data(self.user, typed_columns=[True, False, True, True])

        # Refresh dataset so row_count is available
        self.dataset = Dataset.objects.get(id=self.dataset.id)

        new_rows = [
            (['5', 'Somebody', 'Else', 'Somewhere'], 5),
            (['6', 'Another', 'Person', 'Somewhere'], 6)
        ]

        self.dataset.add_many_rows(self.user, new_rows)
        row = self.dataset.get_row('6')

        self.assertEqual(row['external_id'], '6')
        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'column_int_id:[5 TO 6]')['response']['numFound'], 2)
Example #40
    def test_get_datum(self):
        self.dataset.import_data(self.user, self.upload, 0)

        # Refetch dataset so that attributes will be updated
        self.dataset = Dataset.objects.get(id=self.dataset.id)

        # Get id of a datum in Solr
        datum = solr.query(settings.SOLR_DATA_CORE, 'dataset_slug:%s AND Brian' % self.dataset.slug)['response']['docs'][0]

        response = self.client.get('/api/1.0/dataset/%s/data/%s/' % (self.dataset.slug, datum['external_id']), **self.auth_headers)

        self.assertEqual(response.status_code, 200)

        body = json.loads(response.content)
        
        # Verify that correct attributes of the dataset are attached
        self.assertEqual(body['external_id'], datum['external_id'])
        self.assertEqual(body['dataset'], '/api/1.0/dataset/%s/' % self.dataset.slug)
Example #41
    def test_add_row_typed(self):
        self.dataset.import_data(self.user, self.upload, 0)

        self.dataset.reindex_data(self.user,
                                  typed_columns=[True, False, True, True])

        # Refresh from database
        self.dataset = Dataset.objects.get(id=self.dataset.id)

        new_row = ['5', 'Somebody', 'Else', 'Somewhere']

        self.dataset.add_row(self.user, new_row, external_id='5')
        row = self.dataset.get_row('5')

        self.assertEqual(row['external_id'], '5')
        self.assertEqual(
            solr.query(settings.SOLR_DATA_CORE,
                       'column_int_id:5')['response']['numFound'], 1)
Exemple #42
0
    def test_add_many_rows_typed(self):
        self.dataset.import_data(self.user, self.upload, 0)

        self.dataset.reindex_data(self.user,
                                  typed_columns=[True, False, True, True])

        # Refresh dataset so row_count is available
        self.dataset = Dataset.objects.get(id=self.dataset.id)

        new_rows = [(['5', 'Somebody', 'Else', 'Somewhere'], 5),
                    (['6', 'Another', 'Person', 'Somewhere'], 6)]

        self.dataset.add_many_rows(self.user, new_rows)
        row = self.dataset.get_row('6')

        self.assertEqual(row['external_id'], '6')
        self.assertEqual(
            solr.query(settings.SOLR_DATA_CORE,
                       'column_int_id:[5 TO 6]')['response']['numFound'], 2)
Exemple #43
0
    def test_import_encoded_data(self):
        """
        This tests for a complicated case where a UnicodeDecodeError
        during import could be masked by an AttributeError in the
        return handler.
        """
        old_sniffer_size = settings.PANDA_SNIFFER_MAX_SAMPLE_SIZE
        settings.PANDA_SNIFFER_MAX_SAMPLE_SIZE = 50

        data_upload = utils.get_test_data_upload(
            self.user, self.dataset, utils.TEST_LATIN1_DATA_FILENAME)

        self.dataset.import_data(self.user, data_upload)

        task = self.dataset.current_task

        self.assertNotEqual(task, None)
        self.assertNotEqual(task.id, None)
        self.assertEqual(task.task_name, 'panda.tasks.import.csv')

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)
        data_upload = DataUpload.objects.get(id=data_upload.id)
        task = TaskStatus.objects.get(id=task.id)

        self.assertEqual(len(dataset.column_schema), 8)
        self.assertEqual(dataset.row_count, None)
        self.assertEqual(data_upload.imported, False)
        self.assertEqual(task.status, 'FAILURE')
        self.assertNotEqual(task.start, None)
        self.assertNotEqual(task.end, None)
        self.assertEqual('encoded' in task.traceback, True)
        self.assertEqual(dataset.locked, False)

        self.assertEqual(
            solr.query(settings.SOLR_DATA_CORE,
                       'walking')['response']['numFound'], 0)

        settings.PANDA_SNIFFER_MAX_SAMPLE_SIZE = old_sniffer_size
Exemple #44
0
    def search_dataset_data(self, request, **kwargs):
        """
        Perform a full-text search on only one dataset.

        See ``get_list``.
        """
        dataset = Dataset.objects.get(slug=kwargs["dataset_slug"])

        query = request.GET.get("q", None)
        limit = int(request.GET.get("limit", settings.PANDA_DEFAULT_SEARCH_ROWS))
        offset = int(request.GET.get("offset", 0))

        if query:
            solr_query = "dataset_slug:%s AND %s" % (dataset.slug, query)
        else:
            solr_query = "dataset_slug:%s" % dataset.slug

        response = solr.query(settings.SOLR_DATA_CORE, solr_query, offset=offset, limit=limit)

        dataset_resource = DatasetResource()
        dataset_bundle = dataset_resource.build_bundle(obj=dataset, request=request)
        dataset_bundle = dataset_resource.full_dehydrate(dataset_bundle)
        dataset_bundle = dataset_resource.simplify_bundle(dataset_bundle)

        results = [SolrObject(d) for d in response["response"]["docs"]]

        page = PandaPaginator(
            request.GET, results, resource_uri=request.path_info, count=response["response"]["numFound"]
        ).page()

        dataset_bundle.data.update(page)
        dataset_bundle.data["objects"] = []

        for obj in results:
            bundle = self.build_bundle(obj=obj, request=request)
            bundle = self.full_dehydrate(bundle)
            dataset_bundle.data["objects"].append(bundle.data)

        return dataset_bundle
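
A sketch of exercising this handler through the per-dataset data endpoint, written in the same style as the tests on this page; the URL pattern and fixture attributes are assumptions carried over from those tests, not taken from this snippet.

    def test_search_dataset_data(self):
        # Minimal sketch, assuming the per-dataset search endpoint lives at
        # /api/1.0/dataset/<slug>/data/ and that self.dataset, self.client and
        # self.auth_headers are set up as in the other tests on this page.
        response = self.client.get(
            '/api/1.0/dataset/%s/data/?q=Christopher' % self.dataset.slug,
            **self.auth_headers)

        self.assertEqual(response.status_code, 200)

        body = json.loads(response.content)

        # The dehydrated dataset bundle carries the page metadata plus the
        # matching rows in 'objects'.
        self.assertEqual(body['slug'], self.dataset.slug)
        self.assertTrue('meta' in body)
        self.assertTrue('objects' in body)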
Exemple #45
0
    def test_get_datum(self):
        self.dataset.import_data(self.user, self.upload, 0)

        # Refetch dataset so that attributes will be updated
        self.dataset = Dataset.objects.get(id=self.dataset.id)

        # Get id of a datum in Solr
        datum = solr.query(settings.SOLR_DATA_CORE,
                           'dataset_slug:%s AND Brian' %
                           self.dataset.slug)['response']['docs'][0]

        response = self.client.get(
            '/api/1.0/dataset/%s/data/%s/' %
            (self.dataset.slug, datum['external_id']), **self.auth_headers)

        self.assertEqual(response.status_code, 200)

        body = json.loads(response.content)

        # Verify that correct attributes of the dataset are attached
        self.assertEqual(body['external_id'], datum['external_id'])
        self.assertEqual(body['dataset'],
                         '/api/1.0/dataset/%s/' % self.dataset.slug)
Exemple #46
0
    def test_import_encoded_data(self):
        """
        This tests for a complicated case where a UnicodeDecodeError
        during import could be masked by an AttributeError in the
        return handler.
        """
        old_sniffer_size = settings.PANDA_SNIFFER_MAX_SAMPLE_SIZE
        settings.PANDA_SNIFFER_MAX_SAMPLE_SIZE = 50

        data_upload = utils.get_test_data_upload(self.user, self.dataset, utils.TEST_LATIN1_DATA_FILENAME)

        self.dataset.import_data(self.user, data_upload)

        task = self.dataset.current_task

        self.assertNotEqual(task, None)
        self.assertNotEqual(task.id, None)
        self.assertEqual(task.task_name, 'panda.tasks.import.csv')

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)
        data_upload = DataUpload.objects.get(id=data_upload.id)
        task = TaskStatus.objects.get(id=task.id)

        self.assertEqual(len(dataset.column_schema), 8)
        self.assertEqual(dataset.row_count, None)
        self.assertEqual(data_upload.imported, False)
        self.assertEqual(task.status, 'FAILURE')
        self.assertNotEqual(task.start, None)
        self.assertNotEqual(task.end, None)
        self.assertEqual('encoded' in task.traceback, True)
        self.assertEqual(dataset.locked, False)

        self.assertEqual(solr.query(settings.SOLR_DATA_CORE, 'walking')['response']['numFound'], 0)

        settings.PANDA_SNIFFER_MAX_SAMPLE_SIZE = old_sniffer_size
Exemple #47
0
    def get_list(self, request, **kwargs):
        """
        List endpoint using Solr. Provides full-text search via the "q" parameter.
        """
        limit = int(
            request.GET.get('limit', settings.PANDA_DEFAULT_SEARCH_ROWS))
        offset = int(request.GET.get('offset', 0))
        category_slug = request.GET.get('category', None)
        creator_email = request.GET.get('creator_email', None)
        query = request.GET.get('q', '')
        simple = True if request.GET.get('simple',
                                         'false').lower() == 'true' else False

        if category_slug == settings.PANDA_UNCATEGORIZED_SLUG:
            category_id = settings.PANDA_UNCATEGORIZED_ID
        elif category_slug:
            category_id = Category.objects.get(slug=category_slug).id
        else:
            category_id = None

        if category_id is not None and query:
            q = 'categories:%s %s' % (category_id, query)
        elif category_id is not None:
            q = 'categories:%s' % category_id
        else:
            q = query

        if creator_email:
            datasets = Dataset.objects.filter(creator__email=creator_email)
            count = datasets.count()
            datasets = datasets[offset:offset + limit]
        else:
            response = solr.query(settings.SOLR_DATASETS_CORE,
                                  q,
                                  offset=offset,
                                  limit=limit,
                                  sort='creation_date desc')
            count = response['response']['numFound']

            dataset_slugs = [d['slug'] for d in response['response']['docs']]
            datasets = Dataset.objects.filter(slug__in=dataset_slugs)

        paginator = PandaPaginator(request.GET,
                                   datasets,
                                   resource_uri=request.path_info,
                                   count=count)
        page = paginator.page()

        objects = []

        for obj in datasets:
            bundle = self.build_bundle(obj=obj, request=request)
            bundle = self.full_dehydrate(bundle)

            # Prune attributes we don't care about
            if simple:
                bundle = self.simplify_bundle(bundle)

            objects.append(bundle)

        page['objects'] = objects

        return self.create_response(request, page)
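
A sketch of hitting this list endpoint with the parameters it reads from request.GET; the URL and fixture attributes are assumptions in the style of the tests above, not taken from this snippet.

    def test_dataset_list_search(self):
        # Minimal sketch, assuming the dataset list endpoint lives at
        # /api/1.0/dataset/ and that self.client and self.auth_headers exist
        # as in the other tests on this page.
        response = self.client.get(
            '/api/1.0/dataset/?q=contributors&limit=5&offset=0&simple=true',
            **self.auth_headers)

        self.assertEqual(response.status_code, 200)

        body = json.loads(response.content)

        # get_list() returns a PandaPaginator page with bundles in 'objects'.
        self.assertTrue('meta' in body)
        self.assertTrue('objects' in body)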
Exemple #48
0
    def test_reindex_complex(self):
        upload = utils.get_test_data_upload(
            self.user, self.dataset, filename=utils.TEST_CSV_TYPES_FILENAME)
        self.dataset.import_data(self.user, upload)

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)

        dataset.reindex_data(self.user,
                             typed_columns=[True for c in upload.columns])

        # Refresh from database
        dataset = Dataset.objects.get(id=self.dataset.id)
        task = dataset.current_task

        self.assertEqual(task.status, 'SUCCESS')
        self.assertNotEqual(task.start, None)
        self.assertNotEqual(task.end, None)
        self.assertEqual(task.traceback, None)

        self.assertEqual([c['name'] for c in dataset.column_schema], [
            'text', 'date', 'integer', 'boolean', 'float', 'time', 'datetime',
            'empty_column', ''
        ])
        self.assertEqual([c['type'] for c in dataset.column_schema], [
            'unicode', 'date', 'int', 'bool', 'float', 'time', 'datetime',
            None, 'unicode'
        ])
        self.assertEqual([c['indexed'] for c in dataset.column_schema],
                         [True for c in upload.columns])
        self.assertEqual([c['indexed_name'] for c in dataset.column_schema], [
            'column_unicode_text', 'column_date_date', 'column_int_integer',
            'column_bool_boolean', 'column_float_float', 'column_time_time',
            'column_datetime_datetime', None, 'column_unicode_'
        ])
        self.assertEqual([c['min'] for c in dataset.column_schema], [
            None, u'1920-01-01T00:00:00', 40, None, 1.0,
            u'9999-12-31T00:00:00', u'1971-01-01T04:14:00', None, None
        ])
        self.assertEqual([c['max'] for c in dataset.column_schema], [
            None, u'1971-01-01T00:00:00', 164, None, 41800000.01,
            u'9999-12-31T14:57:13', u'2048-01-01T14:57:00', None, None
        ])
        self.assertEqual(dataset.row_count, 5)
        self.assertEqual(dataset.locked, False)

        self.assertEqual(
            solr.query(settings.SOLR_DATA_CORE,
                       'column_bool_boolean:true')['response']['numFound'], 2)
        self.assertEqual(
            solr.query(settings.SOLR_DATA_CORE,
                       'column_unicode_text:"Chicago Tribune"')['response']
            ['numFound'], 1)
        self.assertEqual(
            solr.query(
                settings.SOLR_DATA_CORE,
                'column_datetime_datetime:[1971-01-01T01:01:01Z TO NOW]')
            ['response']['numFound'], 1)
        self.assertEqual(
            solr.query(settings.SOLR_DATA_CORE,
                       'column_time_time:[9999-12-31T04:13:01Z TO *]')
            ['response']['numFound'], 2)
        self.assertEqual(
            solr.query(settings.SOLR_DATA_CORE,
                       'column_date_date:[1971-01-01T00:00:00Z TO NOW]')
            ['response']['numFound'], 1)
Exemple #49
0
    def test_metadata_searchable(self):
        response = solr.query(settings.SOLR_DATASETS_CORE,
                              'contributors',
                              sort='slug asc')

        self.assertEqual(response['response']['numFound'], 1)
Exemple #50
0
    def run(self, dataset_slug, *args, **kwargs):
        """
        Execute reindex.
        """
        from panda.models import Dataset

        log = logging.getLogger(self.name)
        log.info('Beginning reindex, dataset_slug: %s' % dataset_slug)

        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning(
                'Reindexing failed due to Dataset being deleted, dataset_slug: %s'
                % dataset_slug)

            return

        task_status = dataset.current_task
        task_status.begin(ugettext('Preparing to reindex'))

        if self.is_aborted():
            task_status.abort(ugettext('Aborted during preparation'))

            log.warning('Reindex aborted, dataset_slug: %s' % dataset_slug)

            return

        read_buffer = []
        add_buffer = []
        data_typer = DataTyper(dataset.column_schema)
        throttle = config_value('PERF', 'TASK_THROTTLE')

        i = 0

        while i < dataset.row_count:
            if not read_buffer:
                query = 'dataset_slug: %s' % (dataset.slug)
                response = solr.query(settings.SOLR_DATA_CORE,
                                      query,
                                      limit=SOLR_READ_BUFFER_SIZE,
                                      offset=i)
                read_buffer = response['response']['docs']

            data = read_buffer.pop(0)
            row = json.loads(data['data'])

            new_data = utils.solr.make_data_row(dataset, row)
            new_data['id'] = data['id']
            new_data['data_upload_id'] = data['data_upload_id']
            new_data = data_typer(new_data, row)

            add_buffer.append(new_data)

            if i % SOLR_ADD_BUFFER_SIZE == 0:
                solr.add(settings.SOLR_DATA_CORE, add_buffer)

                add_buffer = []

                task_status.update(
                    ugettext('%.0f%% complete') %
                    floor(float(i) / float(dataset.row_count) * 100))

                if self.is_aborted():
                    task_status.abort(
                        ugettext('Aborted after reindexing %.0f%%') %
                        floor(float(i) / float(dataset.row_count) * 100))

                    log.warning('Reindex aborted, dataset_slug: %s' %
                                dataset_slug)

                    return

                time.sleep(throttle)

            i += 1

        if add_buffer:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

        solr.commit(settings.SOLR_DATA_CORE)

        task_status.update(ugettext('100% complete'))

        # Refresh dataset
        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning(
                'Reindexing could not be completed due to Dataset being deleted, dataset_slug: %s'
                % dataset_slug)

            return

        dataset.column_schema = data_typer.schema
        dataset.save()

        log.info('Finished reindex, dataset_slug: %s' % dataset_slug)

        return data_typer
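
The loop above streams every indexed row back out of Solr and re-adds it in fixed-size batches. A standalone sketch of that read/add buffering follows; the buffer sizes and stand-in functions are illustrative, not PANDA's SOLR_READ_BUFFER_SIZE and SOLR_ADD_BUFFER_SIZE.

    # Standalone sketch of the read/add buffering used by the reindex task
    # above. read_page and add_batch stand in for solr.query and solr.add.
    READ_SIZE = 3
    ADD_SIZE = 2
    rows = ['row-%d' % n for n in range(7)]

    def read_page(offset):
        # Mimics a paged Solr read: up to READ_SIZE docs starting at offset.
        return rows[offset:offset + READ_SIZE]

    added = []

    def add_batch(batch):
        # Mimics solr.add: accepts a batch of re-typed documents.
        added.extend(batch)

    read_buffer = []
    add_buffer = []
    i = 0

    while i < len(rows):
        if not read_buffer:
            read_buffer = read_page(i)

        add_buffer.append(read_buffer.pop(0))

        if i % ADD_SIZE == 0:
            add_batch(add_buffer)
            add_buffer = []

        i += 1

    if add_buffer:
        add_batch(add_buffer)

    assert added == rows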
Exemple #51
0
    def test_metadata_searchable(self):
        response = solr.query(settings.SOLR_DATASETS_CORE, 'contributors', sort='slug asc')

        self.assertEqual(response['response']['numFound'], 1)
Exemple #52
0
    def run(self, dataset_slug, *args, **kwargs):
        """
        Execute reindex.
        """
        from panda.models import Dataset
        
        log = logging.getLogger(self.name)
        log.info('Beginning reindex, dataset_slug: %s' % dataset_slug)

        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning('Reindexing failed due to Dataset being deleted, dataset_slug: %s' % dataset_slug)

            return

        task_status = dataset.current_task
        task_status.begin(ugettext('Preparing to reindex'))

        if self.is_aborted():
            task_status.abort(ugettext('Aborted during preparation'))

            log.warning('Reindex aborted, dataset_slug: %s' % dataset_slug)

            return

        read_buffer = []
        add_buffer = []
        data_typer = DataTyper(dataset.column_schema)
        throttle = config_value('PERF', 'TASK_THROTTLE')

        i = 0

        while i < dataset.row_count:
            if not read_buffer:
                query = 'dataset_slug: %s' % (dataset.slug)
                response = solr.query(settings.SOLR_DATA_CORE, query, limit=SOLR_READ_BUFFER_SIZE, offset=i)
                read_buffer = response['response']['docs']

            data = read_buffer.pop(0)
            row = json.loads(data['data'])

            new_data = utils.solr.make_data_row(dataset, row)
            new_data['id'] = data['id'] 
            new_data['data_upload_id'] = data['data_upload_id']
            new_data = data_typer(new_data, row)

            add_buffer.append(new_data)

            if i % SOLR_ADD_BUFFER_SIZE == 0:
                solr.add(settings.SOLR_DATA_CORE, add_buffer)

                add_buffer = []

                task_status.update(ugettext('%.0f%% complete') % floor(float(i) / float(dataset.row_count) * 100))

                if self.is_aborted():
                    task_status.abort(ugettext('Aborted after reindexing %.0f%%') % floor(float(i) / float(dataset.row_count) * 100))

                    log.warning('Reindex aborted, dataset_slug: %s' % dataset_slug)

                    return
            
                time.sleep(throttle)

            i += 1

        if add_buffer:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

        solr.commit(settings.SOLR_DATA_CORE)

        task_status.update(ugettext('100% complete'))

        # Refresh dataset
        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning('Reindexing could not be completed due to Dataset being deleted, dataset_slug: %s' % dataset_slug)

            return

        dataset.column_schema = data_typer.schema 
        dataset.save()

        log.info('Finished reindex, dataset_slug: %s' % dataset_slug)

        return data_typer
Exemple #53
0
    def run(self, query, task_status_id, filename=None, *args, **kwargs):
        """
        Execute export.
        """
        from panda.models import Dataset, TaskStatus

        log = logging.getLogger(self.name)
        log.info('Beginning export, query: %s' % query)

        task_status = TaskStatus.objects.get(id=task_status_id)
        task_status.begin('Preparing to export')

        if not filename:
            filename = 'search_export_%s' % (now().isoformat())

        zip_name = '%s.zip' % filename

        path = os.path.join(settings.EXPORT_ROOT, filename)
        zip_path = os.path.join(settings.EXPORT_ROOT, zip_name)

        try:
            os.makedirs(os.path.realpath(path))
        except:
            pass

        zipfile = ZipFile(zip_path, 'w')

        response = solr.query_grouped(settings.SOLR_DATA_CORE,
                                      query,
                                      'dataset_slug',
                                      offset=0,
                                      limit=1000,
                                      group_limit=0,
                                      group_offset=0)
        groups = response['grouped']['dataset_slug']['groups']

        datasets = {}

        for group in groups:
            dataset_slug = group['groupValue']
            count = group['doclist']['numFound']

            datasets[dataset_slug] = count

        total_n = 0
        throttle = config_value('PERF', 'TASK_THROTTLE')

        for dataset_slug in datasets:
            try:
                dataset = Dataset.objects.get(slug=dataset_slug)
            except Dataset.DoesNotExist:
                log.warning(
                    'Skipping part of export due to Dataset being deleted, dataset_slug: %s'
                    % dataset_slug)

                continue

            filename = '%s.csv' % dataset_slug
            file_path = os.path.join(path, filename)

            f = open(file_path, 'w')
            writer = CSVKitWriter(f)

            # Header
            writer.writerow([c['name'] for c in dataset.column_schema])

            response = solr.query(settings.SOLR_DATA_CORE,
                                  query,
                                  offset=0,
                                  limit=0)

            # Update dataset and total counts for progress tracking
            datasets[dataset_slug] = response['response']['numFound']
            total_count = sum(datasets.values())

            n = 0

            while n < datasets[dataset_slug]:
                response = solr.query(settings.SOLR_DATA_CORE,
                                      'dataset_slug: %s %s' %
                                      (dataset_slug, query),
                                      offset=n,
                                      limit=SOLR_PAGE_SIZE)

                results = response['response']['docs']

                for row in results:
                    data = json.loads(row['data'])

                    writer.writerow(data)

                task_status.update(
                    '%.0f%% complete' %
                    floor(float(total_n) / float(total_count) * 100))

                if self.is_aborted():
                    task_status.abort(
                        'Aborted after exporting %.0f%%' %
                        floor(float(total_n) / float(total_count) * 100))

                    log.warning('Export aborted, query: %s' % query)

                    return

                n += SOLR_PAGE_SIZE
                # Count only the rows written on this page for progress tracking
                total_n += len(results)

                time.sleep(throttle)

            f.close()

            # Add to zip and nuke temp file
            zipfile.write(file_path, filename)
            os.remove(file_path)

        # Finish zip file and nuke temp directory
        zipfile.close()
        os.rmdir(path)

        task_status.update('100% complete')

        log.info('Finished export, query: %s' % query)

        return zip_name
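
The search export above first asks Solr for results grouped by dataset_slug, then walks the groups to learn which datasets to export. A minimal sketch of pulling the per-dataset counts out of that structure follows; the sample response dict is illustrative.

    # Minimal sketch: extracting per-dataset match counts from a grouped Solr
    # response, as the export task above does. The sample dict is illustrative.
    response = {
        'grouped': {
            'dataset_slug': {
                'groups': [
                    {'groupValue': 'contributors', 'doclist': {'numFound': 4, 'docs': []}},
                    {'groupValue': 'test-dataset', 'doclist': {'numFound': 1, 'docs': []}},
                ]
            }
        }
    }

    datasets = {}

    for group in response['grouped']['dataset_slug']['groups']:
        datasets[group['groupValue']] = group['doclist']['numFound']

    assert datasets == {'contributors': 4, 'test-dataset': 1}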
Exemple #54
0
    def run(self, dataset_slug, *args, **kwargs):
        """
        Execute reindex.
        """
        from panda.models import Dataset

        log = logging.getLogger(self.name)
        log.info("Beginning reindex, dataset_slug: %s" % dataset_slug)

        dataset = Dataset.objects.get(slug=dataset_slug)

        task_status = dataset.current_task
        task_status.begin("Preparing to reindex")

        if self.is_aborted():
            task_status.abort("Aborted during preparation")

            log.warning("Reindex aborted, dataset_slug: %s" % dataset_slug)

            return

        read_buffer = []
        add_buffer = []
        data_typer = DataTyper(dataset.column_schema)
        throttle = config_value("PERF", "TASK_THROTTLE")

        i = 0

        while i < dataset.row_count:
            if not read_buffer:
                query = "dataset_slug: %s" % (dataset.slug)
                response = solr.query(settings.SOLR_DATA_CORE, query, limit=SOLR_READ_BUFFER_SIZE, offset=i)
                read_buffer = response["response"]["docs"]

            data = read_buffer.pop(0)
            row = json.loads(data["data"])

            new_data = utils.solr.make_data_row(dataset, row)
            new_data["id"] = data["id"]
            new_data["data_upload_id"] = data["data_upload_id"]
            new_data = data_typer(new_data, row)

            add_buffer.append(new_data)

            if i % SOLR_ADD_BUFFER_SIZE == 0:
                solr.add(settings.SOLR_DATA_CORE, add_buffer)

                add_buffer = []

                task_status.update("%.0f%% complete" % floor(float(i) / float(dataset.row_count) * 100))

                if self.is_aborted():
                    task_status.abort(
                        "Aborted after reindexing %.0f%%" % floor(float(i) / float(dataset.row_count) * 100)
                    )

                    log.warning("Reindex aborted, dataset_slug: %s" % dataset_slug)

                    return

                time.sleep(throttle)

            i += 1

        if add_buffer:
            solr.add(settings.SOLR_DATA_CORE, add_buffer)
            add_buffer = []

        solr.commit(settings.SOLR_DATA_CORE)

        task_status.update("100% complete")

        # Refresh dataset
        dataset = Dataset.objects.get(slug=dataset_slug)
        dataset.column_schema = data_typer.schema
        dataset.save()

        log.info("Finished reindex, dataset_slug: %s" % dataset_slug)

        return data_typer
Exemple #55
0
    def run(self, dataset_slug, query=None, filename=None, *args, **kwargs):
        """
        Execute export.
        """
        from panda.models import Dataset

        log = logging.getLogger(self.name)
        log.info('Beginning export, dataset_slug:%s %s' %
                 (dataset_slug, query))

        try:
            dataset = Dataset.objects.get(slug=dataset_slug)
        except Dataset.DoesNotExist:
            log.warning(
                'Export failed due to Dataset being deleted, dataset_slug: %s'
                % dataset_slug)

            return

        task_status = dataset.current_task
        task_status.begin('Preparing to export')

        if not filename:
            filename = '%s_%s.csv' % (dataset_slug,
                                      datetime.datetime.utcnow().isoformat())
        else:
            filename = '%s.csv' % filename

        path = os.path.join(settings.EXPORT_ROOT, filename)

        try:
            os.makedirs(os.path.realpath(os.path.dirname(path)))
        except:
            pass

        f = open(path, 'w')
        writer = CSVKitWriter(f)

        # Header
        writer.writerow([c['name'] for c in dataset.column_schema])

        solr_query = 'dataset_slug:%s' % dataset_slug

        if query:
            solr_query = '%s %s' % (solr_query, query)

        response = solr.query(settings.SOLR_DATA_CORE,
                              solr_query,
                              offset=0,
                              limit=0)

        total_count = response['response']['numFound']
        n = 0
        throttle = config_value('PERF', 'TASK_THROTTLE')

        while n < total_count:
            response = solr.query(settings.SOLR_DATA_CORE,
                                  solr_query,
                                  offset=n,
                                  limit=SOLR_PAGE_SIZE)

            results = response['response']['docs']

            for row in results:
                data = json.loads(row['data'])

                writer.writerow(data)

            task_status.update('%.0f%% complete' %
                               floor(float(n) / float(total_count) * 100))

            if self.is_aborted():
                task_status.abort('Aborted after exporting %.0f%%' %
                                  floor(float(n) / float(total_count) * 100))

                log.warning('Export aborted, dataset_slug: %s' % dataset_slug)

                return

            n += SOLR_PAGE_SIZE

            time.sleep(throttle)

        f.close()

        task_status.update('100% complete')

        log.info('Finished export, dataset_slug:%s %s' % (dataset_slug, query))

        return filename
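
The per-dataset export above pages through Solr with a slug-scoped query. A minimal sketch of assembling that query string and the paging offsets it visits; the page size and total count are illustrative, not PANDA's SOLR_PAGE_SIZE or a real Solr result.

    # Minimal sketch of the slug-scoped query string and paging offsets used
    # by the export task above. PAGE_SIZE and total_count are illustrative.
    PAGE_SIZE = 256
    dataset_slug = 'contributors'
    query = 'last_name:Germuska'

    solr_query = 'dataset_slug:%s' % dataset_slug

    if query:
        solr_query = '%s %s' % (solr_query, query)

    # In the real task, total_count comes from a limit=0 Solr query.
    total_count = 1000
    offsets = range(0, total_count, PAGE_SIZE)

    assert solr_query == 'dataset_slug:contributors last_name:Germuska'
    assert list(offsets)[:3] == [0, 256, 512]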