Example #1
 def get_dataset(self, remote_id):
     '''Get or create a dataset given its remote ID (and its source)'''
     dataset = Dataset.objects(__raw__={
         'extras.harvest:remote_id': remote_id,
         'extras.harvest:domain': self.source.domain
     }).first()
     return dataset or Dataset()
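The harvest extras keys contain a colon (harvest:remote_id), which cannot be written as a literal field__subfield keyword argument, hence the __raw__ query. Where keyword syntax is preferred, the same filter can be expressed by unpacking a dict of string keys, as Examples #43 and #55 do. A minimal sketch of that alternative, inside the same method:

# Equivalent lookup via dict unpacking: the colon in the extras key
# rules out literal keyword syntax, so the filter names are passed
# as strings.
dataset = Dataset.objects(**{
    'extras__harvest:remote_id': remote_id,
    'extras__harvest:domain': self.source.domain,
}).first()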
Example #2
    def get_context(self):
        context = super(OrganizationDetailView, self).get_context()

        can_edit = EditOrganizationPermission(self.organization)
        can_view = OrganizationPrivatePermission(self.organization)

        if self.organization.deleted and not can_view.can():
            abort(410)

        datasets = Dataset.objects(organization=self.organization).order_by('-temporal_coverage.end', '-metrics.reuses', '-metrics.followers').visible()
        reuses = Reuse.objects(organization=self.organization).order_by('-metrics.reuses', '-metrics.followers').visible()
        followers = (Follow.objects.followers(self.organization)
                                   .order_by('follower.fullname'))
        context.update({
            'reuses': reuses.paginate(1, self.page_size),
            'datasets': datasets.paginate(1, self.page_size),
            'followers': followers,
            'can_edit': can_edit,
            'can_view': can_view,
            'private_reuses': (
                list(Reuse.objects(organization=self.object).hidden())
                if can_view else []),
            'private_datasets': (
                list(Dataset.objects(organization=self.object).hidden())
                if can_view else []),
        })
        return context
Example #3
    def get_context(self):
        context = super(OrganizationDetailView, self).get_context()

        can_edit = EditOrganizationPermission(self.organization)
        can_view = OrganizationPrivatePermission(self.organization)

        if self.organization.deleted and not can_view.can():
            abort(410)

        datasets = Dataset.objects(organization=self.organization).order_by(
            '-temporal_coverage.end', '-metrics.reuses',
            '-metrics.followers').visible()
        reuses = Reuse.objects(organization=self.organization).order_by(
            '-metrics.reuses', '-metrics.followers').visible()
        followers = (Follow.objects.followers(
            self.organization).order_by('follower.fullname'))
        context.update({
            'reuses': reuses.paginate(1, self.page_size),
            'datasets': datasets.paginate(1, self.page_size),
            'followers': followers,
            'can_edit': can_edit,
            'can_view': can_view,
            'private_reuses': (
                list(Reuse.objects(organization=self.object).hidden())
                if can_view else []),
            'private_datasets': (
                list(Dataset.objects(organization=self.object).hidden())
                if can_view else []),
        })
        return context
Example #4
 def from_organizations(self, user, *organizations):
     from udata.models import Dataset, Reuse  # Circular imports.
     Qs = db.Q()
     for dataset in Dataset.objects(owner=user).visible():
         Qs |= db.Q(subject=dataset)
     for org in organizations:
         for dataset in Dataset.objects(organization=org).visible():
             Qs |= db.Q(subject=dataset)
     for reuse in Reuse.objects.owned_by(*[user.id] + list(organizations)):
         Qs |= db.Q(subject=reuse)
     return self(Qs)
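from_organizations OR-combines one db.Q(subject=...) clause per visible dataset and reuse, so the resulting queryset matches anything whose subject belongs to the user directly or through one of the given organizations. A hedged usage sketch; the owning model is not shown above, so Discussion is a stand-in name:

# Hypothetical call site; Discussion stands in for whatever model
# exposes this queryset helper.
records = Discussion.objects.from_organizations(user, org_a, org_b)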
Example #5
 def get_dataset(self, remote_id):
     '''Get or create a dataset given its remote ID (and its source)
     We first try to match `source_id` to be source domain independent
     '''
     dataset = Dataset.objects(__raw__={
         'extras.harvest:remote_id': remote_id,
         '$or': [
             {'extras.harvest:domain': self.source.domain},
             {'extras.harvest:source_id': str(self.source.id)},
         ],
     }).first()
     return dataset or Dataset()
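The $or clause lets a dataset match either by harvest domain or by source id, so the lookup survives a change of the source's domain. The raw filter sent to MongoDB looks roughly like this (all values are placeholders):

# Sketch of the resulting MongoDB filter document; values are made up.
raw_filter = {
    'extras.harvest:remote_id': 'abc-123',
    '$or': [
        {'extras.harvest:domain': 'data.example.org'},
        {'extras.harvest:source_id': '5f1b6c2e9bd4c84fbe0c0000'},
    ],
}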
Example #6
    def get(self, level):
        pipeline = [
            {'$project': {'territory': '$spatial.territories'}},
            {'$unwind': '$territory'},
            {'$match': {'territory.level': level}},
            {'$group': {'_id': '$territory.id', 'count': {'$sum': 1}}}
        ]
        features = []

        for row in Dataset.objects(spatial__territories__level=level).visible().aggregate(*pipeline):
            territory = Territory.objects.get(id=row['_id'])
            features.append({
                'id': str(territory.id),
                'type': 'Feature',
                'geometry': territory.geom,
                'properties': {
                    'name': territory.name,
                    'code': territory.code,
                    'level': territory.level,
                    'datasets': row['count']
                }
            })

        return {
            'type': 'FeatureCollection',
            'features': features
        }
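The pipeline projects each dataset down to its territories, unwinds that list so each territory becomes its own document, keeps only territories at the requested level, then groups by territory id to count datasets. A hedged sketch of the same stages against a raw pymongo collection (db.dataset is a hypothetical handle to the datasets collection):

# Same aggregation on a raw pymongo collection, stage by stage.
counts = db.dataset.aggregate([
    {'$project': {'territory': '$spatial.territories'}},  # keep territories only
    {'$unwind': '$territory'},                            # one doc per territory
    {'$match': {'territory.level': level}},               # requested level only
    {'$group': {'_id': '$territory.id', 'count': {'$sum': 1}}},
])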
Example #7
    def test_attach_does_not_duplicate(self):
        attached_datasets = []
        for i in range(2):
            dataset = DatasetFactory.build()
            dataset.extras["harvest:domain"] = "test.org"
            dataset.extras["harvest:remote_id"] = str(i)
            dataset.save()
            attached_datasets.append(dataset)

        datasets = DatasetFactory.create_batch(3)

        with NamedTemporaryFile() as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=["local", "remote"], delimiter=b";", quotechar=b'"')

            writer.writeheader()
            for index, dataset in enumerate(datasets):
                writer.writerow({"local": str(dataset.id), "remote": str(index)})
            csvfile.flush()

            result = actions.attach("test.org", csvfile.name)

        dbcount = Dataset.objects(**{"extras__harvest:remote_id__exists": True}).count()
        self.assertEqual(result.success, len(datasets))
        self.assertEqual(dbcount, result.success)
        for index, dataset in enumerate(datasets):
            dataset.reload()
            self.assertEqual(dataset.extras["harvest:domain"], "test.org")
            self.assertEqual(dataset.extras["harvest:remote_id"], str(index))
Example #8
def openfield16():
    datasets = (Dataset.objects(
        badges__kind=OPENFIELD16).visible().order_by('-metrics.followers'))
    return theme.render('openfield16.html',
                        datasets=datasets,
                        badge=OPENFIELD16,
                        nb_displayed_datasets=NB_DISPLAYED_DATASETS)
Example #9
    def test_purge_organizations(self):
        with self.autoindex():
            org = Organization.objects.create(name='delete me',
                                              description='XXX')
            resources = [ResourceFactory() for _ in range(2)]
            dataset = DatasetFactory(resources=resources, organization=org)

        # Upload organization's logo
        file = create_test_image()
        user = AdminFactory()
        self.login(user)
        response = self.post(url_for('api.organization_logo', org=org),
                             {'file': (file, 'test.png')},
                             json=False)
        self.assert200(response)

        # Delete organization
        response = self.delete(url_for('api.organization', org=org))
        self.assert204(response)

        tasks.purge_organizations()

        # Check organization's logo is deleted
        self.assertEqual(list(storages.avatars.list_files()), [])

        dataset = Dataset.objects(id=dataset.id).first()
        self.assertIsNone(dataset.organization)

        organization = Organization.objects(name='delete me').first()
        self.assertIsNone(organization)

        indexed_dataset = DatasetSearch.get(id=dataset.id,
                                            using=es.client,
                                            index=es.index_name)
        self.assertIsNone(indexed_dataset.organization)
Example #10
 def remote_reuses(self):
     # dataset_ids = (d.ext['harvest'].remote_id for d in Dataset.objects(ext__harvest__harvester=self.harvester.id))
     # response = self.get('package_list')
     # for dataset_id in response['result']:
     for dataset in Dataset.objects(ext__harvest__harvester=self.harvester.id).timeout(False):
         try:
             resp = self.get('related_list', {'id': dataset.ext['harvest'].remote_id})
         except:
             log.error('Unable to parse reuse for dataset %s', dataset.id)
             continue
         for details in resp['result']:
             reuse_url = details['url']
             urlhash = Reuse.hash_url(reuse_url)
             reuse, _ = Reuse.objects.get_or_create(urlhash=urlhash, auto_save=False)
             reuse.url = reuse_url
             reuse.title = details['title']
             reuse.description = details['description']
             reuse.type = details['type']
             # reuse.url = details['url']
             reuse.image_url = details.get('image_url')
             reuse.featured = bool(details.get('featured', False))
             reuse.created_at = parse(details['created'])
             if details.get('owner_id'):
                 reuse.owner = self.get_harvested(User, details['owner_id'])
             if dataset not in reuse.datasets:
                 reuse.datasets.append(dataset)
                 for tag in dataset.tags:
                     if tag not in reuse.tags:
                         reuse.tags.append(tag)
             yield reuse
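remote_reuses is a generator that yields unsaved Reuse documents (note auto_save=False on get_or_create), leaving persistence to the caller. A hedged consumption sketch, with backend standing in for an instance of this harvester class:

# Hypothetical call site: saving is deliberately the caller's job.
for reuse in backend.remote_reuses():
    reuse.save()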
Example #11
 def remote_reuses(self):
     # dataset_ids = (d.ext['harvest'].remote_id for d in Dataset.objects(ext__harvest__harvester=self.harvester.id))
     # response = self.get('package_list')
     # for dataset_id in response['result']:
     for dataset in Dataset.objects(
             ext__harvest__harvester=self.harvester.id).timeout(False):
         try:
             resp = self.get('related_list',
                             {'id': dataset.ext['harvest'].remote_id})
         except:
             log.error('Unable to parse reuse for dataset %s', dataset.id)
             continue
         for details in resp['result']:
             reuse_url = details['url']
             urlhash = Reuse.hash_url(reuse_url)
             reuse, _ = Reuse.objects.get_or_create(urlhash=urlhash,
                                                    auto_save=False)
             reuse.url = reuse_url
             reuse.title = details['title']
             reuse.description = details['description']
             reuse.type = details['type']
             # reuse.url = details['url']
             reuse.image_url = details.get('image_url')
             reuse.featured = bool(details.get('featured', False))
             reuse.created_at = parse(details['created'])
             if details.get('owner_id'):
                 reuse.owner = self.get_harvested(User, details['owner_id'])
             if dataset not in reuse.datasets:
                 reuse.datasets.append(dataset)
                 for tag in dataset.tags:
                     if tag not in reuse.tags:
                         reuse.tags.append(tag)
             yield reuse
Example #12
def nec_mergitur():
    datasets = (Dataset.objects(
        badges__kind=NECMERGITUR).visible().order_by('-metrics.followers'))
    return theme.render('nec_mergitur.html',
                        datasets=datasets,
                        badge=NECMERGITUR,
                        nb_displayed_datasets=NB_DISPLAYED_DATASETS)
Example #13
    def get_context(self):
        context = super(OrganizationDetailView, self).get_context()

        org_id = str(self.organization.id)
        datasets, supplied_datasets, reuses = search.multiquery(
            search.SearchQuery(Dataset, sort='-created', organization=org_id, page_size=9),
            search.SearchQuery(Dataset, sort='-created', supplier=org_id, page_size=9),
            search.SearchQuery(Reuse, sort='-created', organization=org_id, page_size=9),
        )
        followers = FollowOrg.objects.followers(self.organization).order_by('follower.fullname')

        can_edit = EditOrganizationPermission(self.organization.id)
        context.update({
            'reuses': reuses,
            'datasets': datasets,
            'supplied_datasets': supplied_datasets,
            'followers': followers[:self.nb_followers],
            'can_edit': can_edit
        })
        if can_edit:
            context.update({
                'private_reuses': list(Reuse.objects(organization=self.object, private=True)),
                'private_datasets': list(Dataset.objects(organization=self.object, private=True)),
            })

        return context
Example #14
def openfield16():
    datasets = (Dataset.objects(badges__kind=OPENFIELD16).visible()
                .order_by('-metrics.followers'))
    return theme.render('openfield16.html',
                        datasets=datasets,
                        badge=OPENFIELD16,
                        nb_displayed_datasets=NB_DISPLAYED_DATASETS)
Example #15
    def get(self, level):
        '''List each zone for a given level with their datasets count'''
        level = GeoLevel.objects.get_or_404(id=level)
        features = []

        for zone in GeoZone.objects(level=level.id):
            # fetch nested levels IDs
            ids = GeoZone.objects(parents=zone.id).only('id').distinct('id')
            ids.append(zone.id)
            # Count datasets in zone
            nb_datasets = Dataset.objects(spatial__zones__in=ids).count()
            features.append({
                'id': zone.id,
                'type': 'Feature',
                'geometry': zone.geom,
                'properties': {
                    'name': _(zone.name),
                    'code': zone.code,
                    'level': zone.level,
                    'datasets': nb_datasets
                }
            })

        return {
            'type': 'FeatureCollection',
            'features': features
        }
Example #16
 def get_value(self):
     ids = itertools.chain(*[
         [r.id for r in d.resources] for d in
         (Dataset.objects(organization=self.target).only('resources') or [])
     ])
     return int(Metrics.objects(object_id__in=ids, level='daily')
                       .sum('values.nb_uniq_visitors'))
Example #17
    def get(self, level):
        '''List each zone for a given level with their datasets count'''
        level = GeoLevel.objects.get_or_404(id=level)
        features = []

        for zone in GeoZone.objects(level=level.id):
            # fetch nested levels IDs
            ids = GeoZone.objects(parents=zone.id).only('id').distinct('id')
            ids.append(zone.id)
            # Count datasets in zone
            nb_datasets = Dataset.objects(spatial__zones__in=ids).count()
            features.append({
                'id': zone.id,
                'type': 'Feature',
                'geometry': zone.geom,
                'properties': {
                    'name': _(zone.name),
                    'code': zone.code,
                    'level': zone.level,
                    'datasets': nb_datasets
                }
            })

        return {
            'type': 'FeatureCollection',
            'features': features
        }
Example #18
def nec_mergitur():
    datasets = (Dataset.objects(badges__kind=NECMERGITUR).visible()
                .order_by('-metrics.followers'))
    return theme.render('nec_mergitur.html',
                        datasets=datasets,
                        badge=NECMERGITUR,
                        nb_displayed_datasets=NB_DISPLAYED_DATASETS)
Example #19
 def get_context(self):
     context = super(OrganizationIssuesView, self).get_context()
     datasets = Dataset.objects(organization=self.organization)
     reuses = Reuse.objects(organization=self.organization)
     ids = [o.id for o in list(datasets) + list(reuses)]
     context['issues'] = Issue.objects(subject__in=ids)
     return context
Example #20
 def check_availability(self):
     from udata.models import Dataset  # Circular imports.
     # Performances: only check the first 20 datasets for now.
     return chain(*[
         dataset.check_availability()
         for dataset in Dataset.objects(organization=self).visible()[:20]
     ])
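check_availability flattens the per-dataset availability results into a single iterable, sampling only the first 20 visible datasets for performance. A hedged sketch of one way to consume it, assuming each element is truthy when the check passed:

# Hypothetical call site: all() consumes the chained iterator lazily
# and short-circuits on the first failing check.
organization_is_available = all(organization.check_availability())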
Example #21
 def get_value(self):
     ids = itertools.chain(
         *[[r.id for r in d.resources] for d in (Dataset.objects(
             organization=self.target).only('resources') or [])])
     return int(
         Metrics.objects(object_id__in=ids,
                         level='daily').sum('values.nb_uniq_visitors'))
Example #22
def migrate(db):
    log.info('Processing resources.')

    datasets = Dataset.objects().no_cache().timeout(False)
    for dataset in datasets:
        save_res = False
        for resource in dataset.resources:
            if resource.url.startswith('https://static.data.gouv.fr'):
                parsed = urlparse(resource.url)
                # str.strip() removes characters, not a prefix; split on the
                # '/resource/' segment to extract the stored filename.
                fs_name = parsed.path.split('/resource/', 1)[-1]
                resource.fs_filename = fs_name
                save_res = True
        if save_res:
            try:
                dataset.save()
            except Exception as e:
                log.warning(e)

    log.info('Processing community resources.')

    community_resources = CommunityResource.objects().no_cache().timeout(False)
    for community_resource in community_resources:
        parsed = urlparse(community_resource.url)
        # Same prefix extraction (str.strip() would remove characters, not a prefix).
        fs_name = parsed.path.split('/resource/', 1)[-1]
        community_resource.fs_filename = fs_name
        try:
            community_resource.save()
        except Exception as e:
            log.warning(e)

    log.info('Completed.')
Example #23
def aggregate_datasets_daily(org, day):
    keys = ['datasets_{0}'.format(k) for k in KEYS]
    ids = [d.id for d in Dataset.objects(organization=org).only('id')]
    metrics = Metrics.objects(object_id__in=ids,
                              level='daily', date=day.isoformat())
    values = [int(metrics.sum('values.{0}'.format(k))) for k in KEYS]
    return Metrics.objects.update_daily(org, day, **dict(zip(keys, values)))
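The dict(zip(keys, values)) step pairs each prefixed key with its summed metric before passing the result as keyword arguments to update_daily. A tiny worked example of that step, with illustrative values only:

# Illustrative only: the KEYS names and daily sums are made up.
keys = ['datasets_views', 'datasets_downloads']
values = [42, 7]
kwargs = dict(zip(keys, values))
# kwargs == {'datasets_views': 42, 'datasets_downloads': 7}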
Example #24
def purge_organizations(self):
    for organization in Organization.objects(deleted__ne=None):
        log.info(f'Purging organization {organization}')
        # Remove followers
        Follow.objects(following=organization).delete()
        # Remove activity
        Activity.objects(related_to=organization).delete()
        Activity.objects(organization=organization).delete()
        # Remove transfers
        Transfer.objects(recipient=organization).delete()
        Transfer.objects(owner=organization).delete()
        # Store datasets for later reindexation
        d_ids = [d.id for d in Dataset.objects(organization=organization)]
        # Remove organization's logo in all sizes
        if organization.logo.filename is not None:
            storage = storages.avatars
            storage.delete(organization.logo.filename)
            storage.delete(organization.logo.original)
            for thumbnail in organization.logo.thumbnails.values():
                storage.delete(thumbnail)
        # Remove
        organization.delete()
        # Reindex the datasets that were linked to the organization
        for id in d_ids:
            reindex(Dataset.__name__, str(id))
Example #25
    def test_default(self):
        org = OrganizationFactory()
        source = HarvestSourceFactory(backend='factory', organization=org)
        with self.assert_emit(signals.before_harvest_job,
                              signals.after_harvest_job):
            self.action(source.slug)

        source.reload()
        self.assertEqual(len(HarvestJob.objects(source=source)), 1)

        job = source.get_last_job()

        self.assertEqual(job.status, 'done')
        self.assertEqual(job.errors, [])
        self.assertIsNotNone(job.started)
        self.assertIsNotNone(job.ended)
        self.assertEqual(len(job.items), COUNT)

        for item in job.items:
            self.assertEqual(item.status, 'done')
            self.assertEqual(item.errors, [])
            self.assertIsNotNone(item.started)
            self.assertIsNotNone(item.ended)
            self.assertIsNotNone(item.dataset)

            dataset = item.dataset
            self.assertIsNotNone(Dataset.objects(id=dataset.id).first())
            self.assertEqual(dataset.organization, org)
            self.assertIn('harvest:remote_id', dataset.extras)
            self.assertIn('harvest:last_update', dataset.extras)
            self.assertIn('harvest:source_id', dataset.extras)

        self.assertEqual(len(HarvestJob.objects), 1)
        self.assertEqual(len(Dataset.objects), COUNT)
Example #26
    def test_default(self):
        org = OrganizationFactory()
        source = HarvestSourceFactory(backend='factory', organization=org)
        with assert_emit(signals.before_harvest_job,
                         signals.after_harvest_job):
            self.action(source.slug)

        source.reload()
        self.assertEqual(len(HarvestJob.objects(source=source)), 1)

        job = source.get_last_job()

        self.assertEqual(job.status, 'done')
        self.assertEqual(job.errors, [])
        self.assertIsNotNone(job.started)
        self.assertIsNotNone(job.ended)
        self.assertEqual(len(job.items), COUNT)

        for item in job.items:
            self.assertEqual(item.status, 'done')
            self.assertEqual(item.errors, [])
            self.assertIsNotNone(item.started)
            self.assertIsNotNone(item.ended)
            self.assertIsNotNone(item.dataset)

            dataset = item.dataset
            self.assertIsNotNone(Dataset.objects(id=dataset.id).first())
            self.assertEqual(dataset.organization, org)
            self.assertIn('harvest:remote_id', dataset.extras)
            self.assertIn('harvest:last_update', dataset.extras)
            self.assertIn('harvest:source_id', dataset.extras)

        self.assertEqual(len(HarvestJob.objects), 1)
        self.assertEqual(len(Dataset.objects), COUNT)
Example #27
    def test_default(self):
        org = OrganizationFactory()
        source = HarvestSourceFactory(backend='factory', organization=org)
        with assert_emit(signals.before_harvest_job,
                         signals.after_harvest_job):
            self.action(source.slug)

        source.reload()
        assert len(HarvestJob.objects(source=source)) == 1

        job = source.get_last_job()

        assert job.status == 'done'
        assert job.errors == []
        assert job.started is not None
        assert job.ended is not None
        assert len(job.items) == COUNT

        for item in job.items:
            assert item.status == 'done'
            assert item.errors == []
            assert item.started is not None
            assert item.ended is not None
            assert item.dataset is not None

            dataset = item.dataset
            assert Dataset.objects(id=dataset.id).first() is not None
            assert dataset.organization == org
            assert 'harvest:remote_id' in dataset.extras
            assert 'harvest:last_update' in dataset.extras
            assert 'harvest:source_id' in dataset.extras

        assert len(HarvestJob.objects) == 1
        assert len(Dataset.objects) == COUNT
Example #28
 def get_dataset(self, remote_id):
     '''Get or create a dataset given its remote ID (and its source)'''
     dataset = Dataset.objects(__raw__={
         'extras.harvest:remote_id': remote_id,
         'extras.harvest:domain': self.source.domain
     }).first()
     return dataset or Dataset()
Example #29
    def test_default(self):
        org = OrganizationFactory()
        source = HarvestSourceFactory(backend='factory', organization=org)
        with assert_emit(signals.before_harvest_job,
                         signals.after_harvest_job):
            self.action(source.slug)

        source.reload()
        assert len(HarvestJob.objects(source=source)) == 1

        job = source.get_last_job()

        assert job.status == 'done'
        assert job.errors == []
        assert job.started is not None
        assert job.ended is not None
        assert len(job.items) == COUNT

        for item in job.items:
            assert item.status == 'done'
            assert item.errors == []
            assert item.started is not None
            assert item.ended is not None
            assert item.dataset is not None

            dataset = item.dataset
            assert Dataset.objects(id=dataset.id).first() is not None
            assert dataset.organization == org
            assert 'harvest:remote_id' in dataset.extras
            assert 'harvest:last_update' in dataset.extras
            assert 'harvest:source_id' in dataset.extras

        assert len(HarvestJob.objects) == 1
        assert len(Dataset.objects) == COUNT
Example #30
 def check_availability(self):
     from udata.models import Dataset  # Circular imports.
     # Performances: only check the first 20 datasets for now.
     return chain(
         *[dataset.check_availability()
           for dataset in Dataset.objects(organization=self).visible()[:20]]
     )
Example #31
def aggregate_datasets_daily(org, day):
    keys = ['datasets_{0}'.format(k) for k in KEYS]
    ids = [d.id for d in Dataset.objects(organization=org).only('id')]
    metrics = Metrics.objects(object_id__in=ids,
                              level='daily',
                              date=day.isoformat())
    values = [int(metrics.sum('values.{0}'.format(k))) for k in KEYS]
    return Metrics.objects.update_daily(org, day, **dict(zip(keys, values)))
Example #32
 def get_value(self):
     ids = [
         d.id for d in (
             Dataset.objects(organization=self.target).only('id') or [])
     ]
     return int(
         Metrics.objects(object_id__in=ids,
                         level='daily').sum('values.nb_uniq_visitors'))
Example #33
def purge_organizations(self):
    for organization in Organization.objects(deleted__ne=None):
        log.info('Purging organization "{0}"'.format(organization))
        # Remove followers
        Follow.objects(following=organization).delete()
        # Remove activity
        Activity.objects(related_to=organization).delete()
        Activity.objects(organization=organization).delete()
        # Remove metrics
        Metrics.objects(object_id=organization.id).delete()
        # Store datasets for later reindexation
        d_ids = [d.id for d in Dataset.objects(organization=organization)]
        # Remove
        organization.delete()
        # Reindex the datasets that were linked to the organization
        for dataset in Dataset.objects(id__in=d_ids):
            reindex(dataset)
Example #34
def climate_change_challenge():
    partners = Organization.objects(slug__in=C3_PARTNERS)
    datasets = (Dataset.objects(
        badges__kind=C3).visible().order_by('-metrics.followers'))
    return theme.render('c3.html',
                        partners=partners,
                        datasets=datasets,
                        badge=C3,
                        nb_displayed_datasets=NB_DISPLAYED_DATASETS)
Example #35
def dadosGovOld_API(org_slug, file_id):
    format = 'json' if request.args.get('format', 'xml').lower() == 'json' else 'xml'
    dataset = Dataset.objects(__raw__={'extras.harvest:remote_id': file_id}).first()
    if dataset:
        for resource in dataset.resources:
            if resource.format == format:
                return redirect(resource.url)
    # Everything else returns 404
    return abort(404)
Example #36
def climate_change_challenge():
    partners = Organization.objects(slug__in=C3_PARTNERS)
    datasets = (Dataset.objects(badges__kind=C3).visible()
                .order_by('-metrics.followers'))
    return theme.render('c3.html',
                        partners=partners,
                        datasets=datasets,
                        badge=C3,
                        nb_displayed_datasets=NB_DISPLAYED_DATASETS)
Example #37
    def mongo_search(cls, args):
        datasets = Dataset.objects(archived=None, deleted=None, private=False)
        datasets = DatasetApiParser.parse_filters(datasets, args)

        sort = (cls.parse_sort(args['sort'])
                or ('$text_score' if args['q'] else None)
                or DEFAULT_SORTING)
        offset = (args['page'] - 1) * args['page_size']
        return (datasets.order_by(sort).skip(offset).limit(args['page_size']),
                datasets.count())
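mongo_search returns a (queryset, total) pair: the queryset is sorted and sliced for the requested page, while count() reports the total of the unsliced queryset. A hedged usage sketch, assuming the classmethod lives on DatasetApiParser as the body suggests; the args shape is inferred from the keys the method reads:

# Hypothetical call site; args keys inferred from the method body.
args = {'q': 'air quality', 'sort': None, 'page': 2, 'page_size': 20}
page, total = DatasetApiParser.mongo_search(args)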
Example #38
def c3_badges(filename):
    '''Toggle C3 badges from an organization list'''
    with open(filename, 'r') as titles:
        user = User.objects(first_name='Etalab', last_name='Bot').first()
        badge = DatasetBadge(kind=C3, created_by=user)
        for title in titles:
            title = title.decode('utf-8').strip(u'\n')
            if title.startswith(u'*'):
                continue
            slug = slugify.slugify(title.lower())
            dataset = (Dataset.objects(title=title).first()
                       or Dataset.objects(slug=slug).first())
            if dataset is None:
                log.info(u'{title} not found'.format(title=title))
            else:
                dataset.badges.append(badge)
                dataset.save()
    log.info('Done')
Example #39
def render_territory(territory):
    if not current_app.config.get('ACTIVATE_TERRITORIES'):
        return abort(404)

    is_present_territory = territory.valid_at(date.today())

    # Retrieve the present territory if not presently valid.
    present_territory = None
    if not is_present_territory:
        present_territory = GeoZone.objects.valid_at(date.today()).get(
            level=territory.level, ancestors__contains=territory.id)

    # Only display dynamic datasets for present territories.
    base_datasets = []
    if is_present_territory:
        DATASETS = TERRITORY_DATASETS[territory.level_code]
        base_dataset_classes = sorted(DATASETS.values(), key=lambda a: a.order)
        base_datasets = [
            base_dataset_class(territory)
            for base_dataset_class in base_dataset_classes
        ]
    territories = [territory]

    # Deal with territories with ancestors.
    for ancestor_object in territory.ancestors_objects:
        territories.append(ancestor_object)

    # Retrieve all datasets then split between those optionally owned
    # by an org for that zone and others. We need to know if the current
    # user has datasets for that zone in order to display a custom
    # message to ease the conversion.
    datasets = Dataset.objects(spatial__zones__in=territories).visible()
    # Retrieving datasets from old regions.
    territory_datasets = []
    other_datasets = []
    editable_datasets = []
    if datasets:
        for dataset in datasets:
            if (dataset.organization
                    and territory.id == dataset.organization.zone):
                territory_datasets.append(dataset)
            else:
                other_datasets.append(dataset)
            editable_datasets.append(current_user.is_authenticated
                                     and DatasetEditPermission(dataset).can())
    context = {
        'territory': territory,
        'present_territory': present_territory,
        'base_datasets': base_datasets,
        'other_datasets': other_datasets,
        'has_pertinent_datasets': any(editable_datasets),
        'territory_datasets': territory_datasets
    }
    template = 'territories/{level_name}.html'.format(
        level_name=territory.level_name)
    return theme.render(template, **context)
Example #40
def resource_redirect(id):
    '''
    Redirect to the latest version of a resource given its identifier.
    '''
    dataset = Dataset.objects(resources__id=id).first()
    if dataset:
        resource = get_by(dataset.resources, 'id', id)
    else:
        resource = CommunityResource.objects(id=id).first()
    return redirect(resource.url.strip()) if resource else abort(404)
Example #41
def rdf_catalog_format(org, format):
    if org.deleted:
        abort(410)
    params = multi_to_dict(request.args)
    page = int(params.get('page', 1))
    page_size = int(params.get('page_size', 100))
    datasets = Dataset.objects(organization=org).visible().paginate(
        page, page_size)
    catalog = build_org_catalog(org, datasets, format=format)
    return graph_response(catalog, format)
Example #42
    def test_purge_organizations(self):
        org = Organization.objects.create(name='delete me', description='XXX')
        resources = [ResourceFactory() for _ in range(2)]
        dataset = DatasetFactory(resources=resources, organization=org)

        # Upload organization's logo
        file = create_test_image()
        user = AdminFactory()
        self.login(user)
        response = self.post(
            url_for('api.organization_logo', org=org),
            {'file': (file, 'test.png')},
            json=False)
        self.assert200(response)

        transfer_to_org = Transfer.objects.create(
            owner=user,
            recipient=org,
            subject=dataset,
            comment='comment',
        )
        transfer_from_org = Transfer.objects.create(
            owner=org,
            recipient=user,
            subject=dataset,
            comment='comment',
        )

        oauth_client = OAuth2Client.objects.create(
            name='test-client',
            owner=user,
            organization=org,
            redirect_uris=['https://test.org/callback'],
        )

        # Delete organization
        response = self.delete(url_for('api.organization', org=org))
        self.assert204(response)

        tasks.purge_organizations()

        oauth_client.reload()
        assert oauth_client.organization is None

        assert Transfer.objects.filter(id=transfer_from_org.id).count() == 0
        assert Transfer.objects.filter(id=transfer_to_org.id).count() == 0

        # Check organization's logo is deleted
        self.assertEqual(list(storages.avatars.list_files()), [])

        dataset = Dataset.objects(id=dataset.id).first()
        self.assertIsNone(dataset.organization)

        organization = Organization.objects(name='delete me').first()
        self.assertIsNone(organization)
Example #43
def attach(domain, filename):
    '''Attach existing datasets to their harvest remote ids before harvesting.

    The expected CSV file format is the following:

    - a column with header "local" containing the local IDs or slugs
    - a column with header "remote" containing the remote IDs

    The delimiter must be ";". Column order and extra
    columns do not matter.
    '''
    count = 0
    errors = 0
    with open(filename) as csvfile:
        reader = csv.DictReader(csvfile, delimiter=b';', quotechar=b'"')
        for row in reader:
            try:
                dataset = Dataset.objects.get(id=ObjectId(row['local']))
            except:  # noqa  (Never stop on failure)
                log.warning('Unable to attach dataset: %s', row['local'])
                errors += 1
                continue

            # Detach previously attached dataset
            Dataset.objects(
                **{
                    'extras__harvest:domain': domain,
                    'extras__harvest:remote_id': row['remote']
                }).update(
                    **{
                        'unset__extras__harvest:domain': True,
                        'unset__extras__harvest:remote_id': True
                    })

            dataset.extras['harvest:domain'] = domain
            dataset.extras['harvest:remote_id'] = row['remote']
            dataset.last_modified = datetime.now()
            dataset.save()
            count += 1

    return AttachResult(count, errors)
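The docstring describes the expected CSV: a "local" column with local IDs or slugs and a "remote" column with remote IDs, delimited by ";". A hypothetical input file for illustration:

# Hypothetical attach.csv matching the documented format; IDs are made up.
sample_csv = (
    'local;remote\n'
    '5f1b6c2e9bd4c84fbe0c0001;42\n'
    'my-dataset-slug;43\n'
)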
Example #44
def attach(domain, filename):
    '''Attach existing datasets to their harvest remote ids before harvesting.

    The expected CSV file format is the following:

    - a column with header "local" containing the local IDs or slugs
    - a column with header "remote" containing the remote IDs

    The delimiter must be ";". Column order and extra
    columns do not matter.
    '''
    count = 0
    errors = 0
    with open(filename) as csvfile:
        reader = csv.DictReader(csvfile,
                                delimiter=b';',
                                quotechar=b'"')
        for row in reader:
            try:
                dataset = Dataset.objects.get(id=ObjectId(row['local']))
            except:  # noqa  (Never stop on failure)
                log.warning('Unable to attach dataset: %s', row['local'])
                errors += 1
                continue

            # Detach previously attached dataset
            Dataset.objects(**{
                'extras__harvest:domain': domain,
                'extras__harvest:remote_id': row['remote']
            }).update(**{
                'unset__extras__harvest:domain': True,
                'unset__extras__harvest:remote_id': True
            })

            dataset.extras['harvest:domain'] = domain
            dataset.extras['harvest:remote_id'] = row['remote']
            dataset.last_modified = datetime.now()
            dataset.save()
            count += 1

    return AttachResult(count, errors)
Example #45
def explore():
    recent_datasets = list(Dataset.objects.visible().order_by('-date').limit(9))
    recent_reuses = list(Reuse.objects.order_by('-date').limit(9))
    featured_datasets = list(Dataset.objects(featured=True).visible().order_by('-date').limit(15))
    featured_reuses = list(Reuse.objects(featured=True).order_by('-date').limit(15))

    return render('explore.html',
        recent_datasets=recent_datasets,
        recent_reuses=recent_reuses,
        featured_datasets=featured_datasets,
        featured_reuses=featured_reuses,
    )
Example #46
def purge_datasets():
    for dataset in Dataset.objects(deleted__ne=None):
        log.info('Purging dataset "{0}"'.format(dataset))
        # Remove followers
        FollowDataset.objects(following=dataset).delete()
        # Remove issues
        DatasetIssue.objects(subject=dataset).delete()
        # Remove activity
        Activity.objects(related_to=dataset).delete()
        # Remove metrics
        Metrics.objects(object_id=dataset.id).delete()
        dataset.delete()
Example #47
def missing_datasets_warning(job_items, source):

    job_datasets = [item.dataset.id for item in job_items]

    domain_harvested_datasets = Dataset.objects(
        __raw__={
            'extras.harvest:domain': source.domain,
            'private': False,
            'deleted': None
        }).all()

    missing_datasets = []
    for dataset in domain_harvested_datasets:
        if dataset.id not in job_datasets:
            dataset.private = True
            missing_datasets.append(dataset)
            dataset.save()

    if missing_datasets:
        org_recipients = [
            member.user.email for member in source.organization.members
            if member.role == 'admin'
        ]
        admin_role = Role.objects.filter(name='admin').first()
        recipients = [
            user.email for user in User.objects.filter(roles=admin_role).all()
        ]

        #recipients = list(set(org_recipients + recipients))

        subject = 'Relatório harvesting dados.gov - {}.'.format(source)

        context = {
            'subject': subject,
            'harvester': source,
            'datasets': missing_datasets,
            'server': current_app.config.get('SERVER_NAME')
        }

        msg = Message(subject=subject,
                      sender='*****@*****.**',
                      recipients=org_recipients,
                      cc=['*****@*****.**'],
                      bcc=recipients)
        msg.body = theme.render('mail/harvester_warning.txt', **context)
        msg.html = theme.render('mail/harvester_warning.html', **context)

        mail = current_app.extensions.get('mail')
        try:
            mail.send(msg)
        except:
            pass
Example #48
    def get_context(self):
        context = super(OrganizationDetailView, self).get_context()

        datasets = Dataset.objects(organization=self.organization).visible().order_by('-created')
        supplied_datasets = Dataset.objects(supplier=self.organization).visible().order_by('-created')
        reuses = Reuse.objects(organization=self.organization).visible().order_by('-created')
        followers = FollowOrg.objects.followers(self.organization).order_by('follower.fullname')

        can_edit = EditOrganizationPermission(self.organization)
        can_view = OrganizationPrivatePermission(self.organization)
        context.update({
            'reuses': reuses.paginate(1, self.page_size),
            'datasets': datasets.paginate(1, self.page_size),
            'supplied_datasets': supplied_datasets[:self.page_size],
            'followers': followers[:self.nb_followers],
            'can_edit': can_edit,
            'can_view': can_view,
            'private_reuses': list(Reuse.objects(organization=self.object).hidden()) if can_view else [],
            'private_datasets': list(Dataset.objects(organization=self.object).hidden()) if can_view else [],
        })

        return context
Example #49
def purge_datasets(self):
    for dataset in Dataset.objects(deleted__ne=None):
        log.info('Purging dataset "{0}"'.format(dataset))
        # Remove followers
        FollowDataset.objects(following=dataset).delete()
        # Remove issues
        DatasetIssue.objects(subject=dataset).delete()
        # Remove activity
        Activity.objects(related_to=dataset).delete()
        # Remove metrics
        Metrics.objects(object_id=dataset.id).delete()
        # Remove
        dataset.delete()
Example #50
    def handle_downloads(self, row, day):
        if 'url' in row:
            try:
                hashed_url = hash_url(row['url'])
                data = (
                    Dataset.objects(resources__urlhash=hashed_url).first()
                    or
                    CommunityResource.objects(urlhash=hashed_url).first()
                )
                if isinstance(data, Dataset):
                    dataset = data
                    resource = get_by(dataset.resources, 'urlhash', hashed_url)
                    log.debug('Found resource download: %s', resource.url)
                    self.count(resource, day, row)
                    metric = ResourceViews(resource)
                    metric.compute()
                    # Use the MongoDB positional operator ($)
                    cmd = 'set__resources__S__metrics__{0}'.format(metric.name)
                    qs = Dataset.objects(id=dataset.id,
                                         resources__id=resource.id)
                    qs.update(**{cmd: metric.value})
                    if dataset.organization:
                        OrgResourcesDownloads(dataset.organization).compute()
                elif isinstance(data, CommunityResource):
                    resource = data
                    log.debug('Found community resource download: %s',
                              resource.url)
                    self.count(resource, day, row)
                    metric = CommunityResourceViews(resource)
                    metric.compute()
                    resource.metrics[metric.name] = metric.value
                    resource.save()

            except:
                log.exception('Unable to count download for %s', row['url'])
        if 'subtable' in row:
            for subrow in row['subtable']:
                self.handle_downloads(subrow, day)
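The S in set__resources__S__metrics__<name> is mongoengine's spelling of MongoDB's positional $ operator: combined with the resources__id filter, the update touches the metric on just the matched embedded resource. The raw update would look roughly like this (a sketch; the metric name and the embedded id field name are illustrative):

# Rough raw-MongoDB equivalent of the positional update above;
# 'views' and the 'resources.id' field name are illustrative.
db.dataset.update_one(
    {'_id': dataset.id, 'resources.id': resource.id},
    {'$set': {'resources.$.metrics.views': metric.value}},
)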
Example #51
    def get_context(self):
        context = super(OrganizationDetailView, self).get_context()

        can_edit = EditOrganizationPermission(self.organization)
        can_view = OrganizationPrivatePermission(self.organization)

        if self.organization.deleted and not can_view.can():
            abort(410)

        datasets = Dataset.objects(organization=self.organization).visible()
        reuses = Reuse.objects(organization=self.organization).visible()
        followers = FollowOrg.objects.followers(self.organization).order_by("follower.fullname")
        context.update(
            {
                "reuses": reuses.paginate(1, self.page_size),
                "datasets": datasets.paginate(1, self.page_size),
                "followers": followers,
                "can_edit": can_edit,
                "can_view": can_view,
                "private_reuses": (list(Reuse.objects(organization=self.object).hidden()) if can_view else []),
                "private_datasets": (list(Dataset.objects(organization=self.object).hidden()) if can_view else []),
            }
        )
        return context
Example #52
    def test_purge_organizations(self):
        with self.autoindex():
            org = Organization.objects.create(
                name='delete me', deleted='2016-01-01', description='XXX')
            resources = [ResourceFactory() for _ in range(2)]
            dataset = DatasetFactory(resources=resources, organization=org)

        tasks.purge_organizations()

        dataset = Dataset.objects(id=dataset.id).first()
        self.assertIsNone(dataset.organization)

        organization = Organization.objects(name='delete me').first()
        self.assertIsNone(organization)

        indexed_dataset = DatasetSearch.get(id=dataset.id,
                                            using=es.client,
                                            index=es.index_name)
        self.assertIsNone(indexed_dataset.organization)
Example #53
    def serialize(cls, reuse):
        """By default use the ``to_dict`` method

        and exclude ``_id``, ``_cls`` and ``owner`` fields.
        """
        datasets = Dataset.objects(id__in=[r.id for r in reuse.datasets])
        datasets = list(datasets.only('id', 'title').no_dereference())
        organization = None
        owner = None
        if reuse.organization:
            organization = Organization.objects(id=reuse.organization.id).first()
        elif reuse.owner:
            owner = User.objects(id=reuse.owner.id).first()
        return {
            'title': reuse.title,
            'description': reuse.description,
            'url': reuse.url,
            'organization': str(organization.id) if organization else None,
            'owner': str(owner.id) if owner else None,
            'type': reuse.type,
            'tags': reuse.tags,
            'tag_suggest': reuse.tags,
            'badges': [badge.kind for badge in reuse.badges],
            'created': reuse.created_at.strftime('%Y-%m-%dT%H:%M:%S'),
            'last_modified': reuse.last_modified.strftime('%Y-%m-%dT%H:%M:%S'),
            'dataset': [{
                'id': str(d.id),
                'title': d.title
            } for d in datasets],
            'metrics': reuse.metrics,
            'featured': reuse.featured,
            'extras': reuse.extras,
            'reuse_suggest': {
                'input': cls.completer_tokenize(reuse.title) + [reuse.id],
                'output': str(reuse.id),
                'payload': {
                    'title': reuse.title,
                    'slug': reuse.slug,
                    'image_url': reuse.image(40, external=True),
                },
            },
        }
Example #54
    def get_context(self):
        context = super(OrganizationDetailView, self).get_context()

        org_id = str(self.organization.id)
        datasets, supplied_datasets, reuses = multiquery(
            SearchQuery(DatasetSearch, sort='-created', organization=org_id, page_size=9),
            SearchQuery(DatasetSearch, sort='-created', supplier=org_id, page_size=9),
            SearchQuery(ReuseSearch, sort='-created', organization=org_id, page_size=9),
        )

        context.update({
            'reuses': reuses,
            'datasets': datasets,
            'supplied_datasets': supplied_datasets,
            'private_reuses': list(Reuse.objects(organization=self.object, private=True)),
            'private_datasets': list(Dataset.objects(organization=self.object, private=True)),
            'can_edit': EditOrganizationPermission(self.organization.id)
        })

        return context
Example #55
    def test_attach_does_not_duplicate(self):
        attached_datasets = []
        for i in range(2):
            dataset = DatasetFactory.build()
            dataset.extras['harvest:domain'] = 'test.org'
            dataset.extras['harvest:remote_id'] = str(i)
            dataset.last_modified = datetime.now()
            dataset.save()
            attached_datasets.append(dataset)

        datasets = DatasetFactory.create_batch(3)

        with NamedTemporaryFile() as csvfile:
            writer = csv.DictWriter(csvfile,
                                    fieldnames=['local', 'remote'],
                                    delimiter=b';',
                                    quotechar=b'"')

            writer.writeheader()
            for index, dataset in enumerate(datasets):
                writer.writerow({
                    'local': str(dataset.id),
                    'remote': str(index)
                })
            csvfile.flush()

            result = actions.attach('test.org', csvfile.name)

        dbcount = Dataset.objects(**{
            'extras__harvest:remote_id__exists': True
        }).count()
        self.assertEqual(result.success, len(datasets))
        self.assertEqual(dbcount, result.success)
        for index, dataset in enumerate(datasets):
            dataset.reload()
            self.assertEqual(dataset.extras['harvest:domain'], 'test.org')
            self.assertEqual(dataset.extras['harvest:remote_id'], str(index))
Example #56
 def get_value(self):
     return Dataset.objects(owner=self.user).visible().count()