Example #1
 def crawl(self):
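     # Upsert every idashboard ticket as a watchlist entity and emit only the changed terms.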
     url = urljoin(self.host, '/ticket/all_closed/?format=json')
     watchlist = Watchlist.by_foreign_id(url, {
         'label': 'Investigative Dashboard Requests'
     })
     Permission.grant_foreign(watchlist, 'idashboard:occrp_staff',
                              True, False)
     existing_entities = []
     previous_terms = watchlist.terms
     updated_terms = set()
     db.session.flush()
     for endpoint in ['all_closed', 'all_open']:
         url = urljoin(self.host, '/ticket/%s/?format=json' % endpoint)
         data = self.session.get(url).json()
         for req in data.get('paginator', {}).get('object_list', []):
             category = REQUEST_TYPES.get(req.get('ticket_type'))
             if category is None:
                 continue
             ent = Entity.by_foreign_id(str(req.get('id')), watchlist, {
                 'name': req.get('name'),
                 'category': category,
                 'data': req,
                 'selectors': [req.get('name')]
             })
             updated_terms.update(ent.terms)
             existing_entities.append(ent.id)
             log.info("  # %s (%s)", ent.name, ent.category)
     watchlist.delete_entities(spare=existing_entities)
     terms = previous_terms.symmetric_difference(updated_terms)
     self.emit_watchlist(watchlist, terms)
Example #2
    def crawl(self):
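        # Walk the closed and open ticket feeds and refresh the matching collection entities.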
        url = urljoin(self.host, '/ticket/all_closed/?format=json')
        collection = Collection.by_foreign_id(url, {
            'label': 'Investigative Dashboard Requests'
        })
        Permission.grant_foreign(collection, 'idashboard:occrp_staff',
                                 True, False)
        existing_entities = []
        terms = set()
        db.session.flush()
        for endpoint in ['all_closed', 'all_open']:
            url = urljoin(self.host, '/ticket/%s/?format=json' % endpoint)
            data = self.session.get(url).json()

            for req in data.get('paginator', {}).get('object_list', []):
                ent = self.update_entity(req, collection)
                if ent is not None:
                    terms.update(ent.terms)
                    existing_entities.append(ent.id)
                    log.info("  # %s", ent.name)

        for entity in collection.entities:
            if entity.id not in existing_entities:
                entity.delete()
        self.emit_collection(collection, terms)
Example #3
    def crawl(self):
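        # Map each ticket to an entity category via REQUEST_TYPES, skipping unknown ticket types.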
        url = urljoin(self.host, '/ticket/all_closed/?format=json')
        collection = Collection.by_foreign_id(url, {
            'label': 'Investigative Dashboard Requests'
        })
        Permission.grant_foreign(collection, 'idashboard:occrp_staff',
                                 True, False)
        existing_entities = []
        terms = set()
        db.session.flush()
        for endpoint in ['all_closed', 'all_open']:
            url = urljoin(self.host, '/ticket/%s/?format=json' % endpoint)
            data = self.session.get(url).json()
            for req in data.get('paginator', {}).get('object_list', []):
                category = REQUEST_TYPES.get(req.get('ticket_type'))
                if category is None:
                    continue
                ent = Entity.by_foreign_id(str(req.get('id')), collection, {
                    'name': req.get('name'),
                    'category': category,
                    'data': req,
                    'selectors': [req.get('name')]
                })
                terms.update(ent.terms)
                existing_entities.append(ent.id)
                log.info("  # %s (%s)", ent.name, ent.category)

        for entity in collection.entities:
            if entity.id not in existing_entities:
                entity.delete()
        self.emit_collection(collection, terms)
Example #4
    def crawl_source(self, source):
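        # Build one collection per OpenNames source and emit an entity for every record in it.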
        if source.get('source_id') in IGNORE_SOURCES:
            return

        json_file = source.get('data', {}).get('json')
        url = urljoin(JSON_PATH, json_file)
        source_name = source.get('source') or source.get('source_id')
        label = '%s - %s' % (source.get('publisher'), source_name)
        collection = self.find_collection(url, {
            'label': label
        })
        Permission.grant_foreign(collection, Role.SYSTEM_GUEST, True, False)
        log.info(" > OpenNames collection: %s", collection.label)
        entities = requests.get(url).json().get('entities', [])
        for entity in entities:
            data = {
                'identifiers': [{
                    'scheme': 'opennames:%s' % source.get('source_id'),
                    'identifier': entity.get('uid')
                }],
                'other_names': [],
                'name': entity.get('name'),
                '$schema': SCHEMA.get(entity.get('type'),
                                      '/entity/entity.json#')
            }
            for on in entity.get('other_names', []):
                on['name'] = on.pop('other_name', None)
                data['other_names'].append(on)
            self.emit_entity(collection, data)
        self.emit_collection(collection)
Example #5
    def crawl_source(self, source):
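        # Translate each OpenNames record into an entity payload, renaming 'other_name' to 'name'.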
        if source.get('source_id') in IGNORE_SOURCES:
            return

        json_file = source.get('data', {}).get('json')
        url = urljoin(JSON_PATH, json_file)
        source_name = source.get('source') or source.get('source_id')
        label = '%s - %s' % (source.get('publisher'), source_name)
        collection = self.find_collection(url, {'label': label})
        Permission.grant_foreign(collection, Role.SYSTEM_GUEST, True, False)
        log.info(" > OpenNames collection: %s", collection.label)
        entities = requests.get(url).json().get('entities', [])
        for entity in entities:
            data = {
                'identifiers': [{
                    'scheme': 'opennames:%s' % source.get('source_id'),
                    'identifier': entity.get('uid')
                }],
                'other_names': [],
                'name': entity.get('name'),
                '$schema': SCHEMA.get(entity.get('type'),
                                      '/entity/entity.json#')
            }
            for on in entity.get('other_names', []):
                on['name'] = on.pop('other_name', None)
                data['other_names'].append(on)
            self.emit_entity(collection, data)
        self.emit_collection(collection)
Example #6
    def crawl(self):
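        # Refresh ticket entities from both endpoints and delete the ones no longer listed.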
        url = urljoin(self.host, '/ticket/all_closed/?format=json')
        collection = Collection.by_foreign_id(
            url, {'label': 'Investigative Dashboard Requests'})
        Permission.grant_foreign(collection, 'idashboard:occrp_staff', True,
                                 False)
        existing_entities = []
        terms = set()
        db.session.flush()
        for endpoint in ['all_closed', 'all_open']:
            url = urljoin(self.host, '/ticket/%s/?format=json' % endpoint)
            data = self.session.get(url).json()

            for req in data.get('paginator', {}).get('object_list', []):
                ent = self.update_entity(req, collection)
                if ent is not None:
                    terms.update(ent.terms)
                    existing_entities.append(ent.id)
                    log.info("  # %s", ent.name)

        for entity in collection.entities:
            if entity.id not in existing_entities:
                entity.delete()
        self.emit_collection(collection, terms)
Example #7
    def crawl_collection(self, collection):
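        # Mirror a Spindle collection as a watchlist, copying its permission grants.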
        if not len(collection.get('subjects', [])):
            return
        url = urljoin(self.URL, '/api/collections/%s' % collection.get('id'))
        watchlist = Watchlist.by_foreign_id(url, {
            'label': collection.get('title')
        })
        res = requests.get('%s/permissions' % url, headers=self.HEADERS)
        for perm in res.json().get('results', []):
            Permission.grant_foreign(watchlist, perm.get('role'),
                                     perm.get('read'), perm.get('write'))

        log.info(" > Spindle collection: %s", watchlist.label)
        res = requests.get('%s/entities' % url, headers=self.HEADERS)
        previous_terms = watchlist.terms
        updated_terms = set()
        existing_entities = []
        for entity in res.json().get('results', []):
            if entity.get('name') is None:
                continue
            aliases = [on.get('alias') for on in entity.get('other_names', [])]
            ent = Entity.by_foreign_id(entity.get('id'), watchlist, {
                'name': entity.get('name'),
                'category': SCHEMATA.get(entity.get('$schema'), OTHER),
                'data': entity,
                'selectors': aliases
            })
            updated_terms.update(ent.terms)
            existing_entities.append(ent.id)
            log.info("  # %s (%s)", ent.name, ent.category)
        watchlist.delete_entities(spare=existing_entities)
        terms = previous_terms.symmetric_difference(updated_terms)
        self.emit_watchlist(watchlist, terms)
Example #8
File: spindle.py Project: 01-/aleph
    def crawl_collection(self, collection):
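        # Import Spindle entities into a collection, using other_names aliases as selectors.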
        if not len(collection.get('subjects', [])):
            return
        url = urljoin(self.URL, '/api/collections/%s' % collection.get('id'))
        collection = Collection.by_foreign_id(url, {
            'label': collection.get('title')
        })
        res = requests.get('%s/permissions' % url, headers=self.HEADERS)
        for perm in res.json().get('results', []):
            Permission.grant_foreign(collection, perm.get('role'),
                                     perm.get('read'), perm.get('write'))

        log.info(" > Spindle collection: %s", collection.label)
        res = requests.get('%s/entities' % url, headers=self.HEADERS)
        terms = set()
        existing_entities = []
        for entity in res.json().get('results', []):
            if entity.get('name') is None:
                continue
            aliases = [on.get('alias') for on in entity.get('other_names', [])]
            ent = Entity.by_foreign_id(entity.get('id'), collection, {
                'name': entity.get('name'),
                'category': SCHEMATA.get(entity.get('$schema'), OTHER),
                'data': entity,
                'selectors': aliases
            })
            terms.update(ent.terms)
            existing_entities.append(ent.id)
            log.info("  # %s (%s)", ent.name, ent.category)

        for entity in collection.entities:
            if entity.id not in existing_entities:
                entity.delete()
        self.emit_collection(collection, terms)
Example #9
    def crawl_collection(self, collection):
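        # Normalize raw Spindle entities into the local schema before saving them.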
        if not len(collection.get('subjects', [])):
            return
        url = urljoin(self.URL, '/api/collections/%s' % collection.get('id'))
        collection = Collection.by_foreign_id(url, {
            'label': collection.get('title')
        })
        res = requests.get('%s/permissions' % url, headers=self.HEADERS)
        for perm in res.json().get('results', []):
            Permission.grant_foreign(collection, perm.get('role'),
                                     perm.get('read'), perm.get('write'))

        log.info(" > Spindle collection: %s", collection.label)
        res = requests.get('%s/entities' % url, headers=self.HEADERS)
        terms = set()
        existing_entities = []
        for entity in res.json().get('results', []):
            if entity.get('name') is None:
                continue
            entity['$schema'] = SCHEMATA.get(entity.get('$schema'), OTHER)
            if 'jurisdiction_code' in entity:
                entity['jurisdiction_code'] = \
                    entity['jurisdiction_code'].lower()
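            # Drop nested relationship fields that the flat entity record does not keep.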
            entity.pop('members', None)
            entity.pop('memberships', None)
            entity.pop('assets', None)
            entity.pop('owners', None)
            entity.pop('family_first', None)
            entity.pop('family_second', None)
            entity.pop('social_first', None)
            entity.pop('social_second', None)

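            # Keep only the date portion of ISO datetime values.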
            for date_field in ['birth_date']:
                if date_field in entity and 'T' in entity[date_field]:
                    entity[date_field], _ = entity[date_field].split('T', 1)

            for on in entity.get('other_names', []):
                name = on.pop('alias', None)
                if name is not None:
                    on['name'] = name

            entity['identifiers'] = [{
                'scheme': 'spindle',
                'identifier': entity.pop('id', None)
            }]
            ent = Entity.save(entity, collection_id=collection.id, merge=True)
            db.session.flush()
            terms.update(ent.terms)
            existing_entities.append(ent.id)
            log.info("  # %s", ent.name)

        for entity in collection.entities:
            if entity.id not in existing_entities:
                entity.delete()
        self.emit_collection(collection, terms)
Example #10
    def crawl(self):
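        # Update an entity for every ticket in the closed and open feeds, then emit the collection.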
        url = urljoin(self.host, '/ticket/all_closed/?format=json')
        coll = self.find_collection(
            url, {'label': 'Investigative Dashboard Requests'})
        Permission.grant_foreign(coll, 'idashboard:occrp_staff', True, False)
        for endpoint in ['all_closed', 'all_open']:
            url = urljoin(self.host, '/ticket/%s/?format=json' % endpoint)
            data = self.session.get(url).json()

            for req in data.get('paginator', {}).get('object_list', []):
                # TODO: get the ID API fixed.
                self.update_entity(req, coll)

        self.emit_collection(coll)
Example #11
    def crawl(self):
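        # Minimal ticket crawl: look up the collection, then delegate each request to update_entity.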
        url = urljoin(self.host, '/ticket/all_closed/?format=json')
        coll = self.find_collection(url, {
            'label': 'Investigative Dashboard Requests'
        })
        Permission.grant_foreign(coll, 'idashboard:occrp_staff',
                                 True, False)
        for endpoint in ['all_closed', 'all_open']:
            url = urljoin(self.host, '/ticket/%s/?format=json' % endpoint)
            data = self.session.get(url).json()

            for req in data.get('paginator', {}).get('object_list', []):
                # TODO: get the ID API fixed.
                self.update_entity(req, coll)

        self.emit_collection(coll)
Example #12
    def crawl_source(self, source):
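        # Merge each OpenNames record via Entity.save and drop entities that vanished upstream.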
        if source.get('source_id') in IGNORE_SOURCES:
            return

        json_file = source.get('data', {}).get('json')
        url = urljoin(JSON_PATH, json_file)
        source_name = source.get('source') or source.get('source_id')
        label = '%s - %s' % (source.get('publisher'), source_name)
        collection = Collection.by_foreign_id(url, {'label': label})
        Permission.grant_foreign(collection, Role.SYSTEM_GUEST, True, False)
        log.info(" > OpenNames collection: %s", collection.label)
        terms = set()
        existing_entities = []
        db.session.flush()
        entities = requests.get(url).json().get('entities', [])
        for entity in entities:
            data = {
                'identifiers': [{
                    'scheme': 'opennames:%s' % source.get('source_id'),
                    'identifier': entity.get('uid')
                }],
                'other_names': [],
                'name': entity.get('name'),
                '$schema': SCHEMA.get(entity.get('type'),
                                      '/entity/entity.json#')
            }
            for on in entity.get('other_names', []):
                on['name'] = on.pop('other_name', None)
                data['other_names'].append(on)

            ent = Entity.save(data, collection_id=collection.id, merge=True)
            db.session.flush()
            terms.update(ent.terms)
            existing_entities.append(ent.id)
            log.info("  # %s", ent.name)

        for entity in collection.entities:
            if entity.id not in existing_entities:
                entity.delete()

        self.emit_collection(collection, terms)
Example #13
    def crawl_item(self, item, sources, source):
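        # Create the source on first sight, granting guest or user access when the feed asks for it.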
        source_data = item.meta.get('source', {})
        source_id = source_data.pop('foreign_id', source)
        if source_id is None:
            raise ValueError("No foreign_id for source given: %r" % item)
        if source_id not in sources:
            label = source_data.get('label', source_id)
            sources[source_id] = self.create_source(foreign_id=source_id,
                                                    label=label)
            if source_data.get('public'):
                Permission.grant_foreign(sources[source_id], Role.SYSTEM_GUEST,
                                         True, False)
            if source_data.get('users'):
                Permission.grant_foreign(sources[source_id], Role.SYSTEM_USER,
                                         True, False)

        log.info('Import: %r', item.identifier)
        meta = self.normalize_metadata(item)
        self.emit_file(sources[source_id], meta, item.data_path)
Example #14
    def crawl_source(self, source):
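        # Persist OpenNames records with Entity.save(merge=True) and prune entities missing upstream.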
        if source.get('source_id') in IGNORE_SOURCES:
            return

        json_file = source.get('data', {}).get('json')
        url = urljoin(JSON_PATH, json_file)
        source_name = source.get('source') or source.get('source_id')
        label = '%s - %s' % (source.get('publisher'), source_name)
        collection = Collection.by_foreign_id(url, {
            'label': label
        })
        Permission.grant_foreign(collection, Role.SYSTEM_GUEST, True, False)
        log.info(" > OpenNames collection: %s", collection.label)
        terms = set()
        existing_entities = []
        db.session.flush()
        entities = requests.get(url).json().get('entities', [])
        for entity in entities:
            data = {
                'identifiers': [{
                    'scheme': 'opennames:%s' % source.get('source_id'),
                    'identifier': entity.get('uid')
                }],
                'other_names': [],
                'name': entity.get('name'),
                '$schema': SCHEMA.get(entity.get('type'),
                                      '/entity/entity.json#')
            }
            for on in entity.get('other_names', []):
                on['name'] = on.pop('other_name', None)
                data['other_names'].append(on)

            ent = Entity.save(data, collection_id=collection.id, merge=True)
            db.session.flush()
            terms.update(ent.terms)
            existing_entities.append(ent.id)
            log.info("  # %s", ent.name)

        for entity in collection.entities:
            if entity.id not in existing_entities:
                entity.delete()

        self.emit_collection(collection, terms)
Example #15
    def crawl_source(self, source):
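        # Use other-name aliases and identity numbers as selectors when upserting OpenNames entities.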
        if source.get('source_id') in IGNORE_SOURCES:
            return

        json_file = source.get('data', {}).get('json')
        url = urljoin(JSON_PATH, json_file)
        source_name = source.get('source') or source.get('source_id')
        label = '%s - %s' % (source.get('publisher'), source_name)
        collection = Collection.by_foreign_id(url, {
            'label': label
        })
        Permission.grant_foreign(collection, Role.SYSTEM_GUEST, True, False)
        log.info(" > OpenNames collection: %s", collection.label)
        terms = set()
        existing_entities = []
        db.session.flush()
        entities = requests.get(url).json().get('entities', [])
        for entity in entities:
            if entity.get('name') is None:
                continue
            selectors = []
            for on in entity.get('other_names', []):
                selectors.append(on.get('other_name'))

            for iden in entity.get('identities', []):
                if iden.get('number'):
                    selectors.append(iden.get('number'))

            ent = Entity.by_foreign_id(entity.get('uid'), collection, {
                'name': entity.get('name'),
                'category': CATEGORIES.get(entity.get('type'), OTHER),
                'data': entity,
                'selectors': selectors
            })
            terms.update(ent.terms)
            existing_entities.append(ent.id)
            log.info("  # %s (%s)", ent.name, ent.category)

        for entity in collection.entities:
            if entity.id not in existing_entities:
                entity.delete()
        self.emit_collection(collection, terms)
Example #16
File: mf.py Project: 01-/aleph
    def crawl_item(self, item, sources, source):
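        # Resolve the item's source lazily by foreign_id, then emit the file with normalized metadata.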
        source_data = item.meta.get('source', {})
        source_id = source_data.pop('foreign_id', source)
        if source_id is None:
            raise ValueError("No foreign_id for source given: %r" % item)
        if source_id not in sources:
            label = source_data.get('label', source_id)
            sources[source_id] = self.create_source(foreign_id=source_id,
                                                    label=label)
            if source_data.get('public'):
                Permission.grant_foreign(sources[source_id],
                                         Role.SYSTEM_GUEST,
                                         True, False)
            if source_data.get('users'):
                Permission.grant_foreign(sources[source_id],
                                         Role.SYSTEM_USER,
                                         True, False)

        log.info('Import: %r', item.identifier)
        meta = self.normalize_metadata(item)
        self.emit_file(sources[source_id], meta, item.data_path)
Example #17
    def crawl_item(self, item):
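        # Create collections on demand, keyed by foreign_id, and ingest the file without moving it.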
        coll_data = item.meta.get('source', {})
        coll_fk = coll_data.pop('foreign_id', None)
        if coll_fk is None:
            raise ValueError("No foreign_id for collection given: %r" % item)
        if coll_fk not in self.collections:
            label = coll_data.get('label', coll_fk)
            self.collections[coll_fk] = Collection.create({
                'foreign_id': coll_fk,
                'label': label
            })
            if coll_data.get('public'):
                Permission.grant_foreign(self.collections[coll_fk],
                                         Role.SYSTEM_GUEST,
                                         True, False)
            db.session.commit()

        log.info('Import: %r', item.identifier)
        meta = self.normalize_metadata(item)
        ingest_file(self.collections[coll_fk].id, meta,
                    item.data_path, move=False)
Example #18
    def crawl_item(self, item):
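        # Like the previous example, but the auto-created collection is flagged as managed.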
        coll_data = item.meta.get('source', {})
        coll_fk = coll_data.pop('foreign_id', None)
        if coll_fk is None:
            raise ValueError("No foreign_id for collection given: %r" % item)
        if coll_fk not in self.collections:
            label = coll_data.get('label', coll_fk)
            self.collections[coll_fk] = Collection.create({
                'foreign_id': coll_fk,
                'label': label,
                'managed': True
            })
            if coll_data.get('public'):
                Permission.grant_foreign(self.collections[coll_fk],
                                         Role.SYSTEM_GUEST,
                                         True, False)
            db.session.commit()

        log.info('Import: %r', item.identifier)
        meta = self.normalize_metadata(item)
        ingest_file(self.collections[coll_fk].id, meta,
                    item.data_path, move=False)
Example #19
    def crawl_item(self, item, source):
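        # Source-based variant: create the Source record once, then ingest the file in place.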
        source_data = item.meta.get('source', {})
        source_fk = source_data.pop('foreign_id', source)
        if source_fk is None:
            raise ValueError("No foreign_id for source given: %r" % item)
        if source_fk not in self.sources:
            label = source_data.get('label', source_fk)
            self.sources[source_fk] = Source.create({
                'foreign_id': source_fk,
                'label': label
            })
            if source_data.get('public'):
                Permission.grant_foreign(self.sources[source_fk],
                                         Role.SYSTEM_GUEST, True, False)
            db.session.commit()

        log.info('Import: %r', item.identifier)
        meta = self.normalize_metadata(item)
        ingest_file(self.sources[source_fk].id,
                    meta,
                    item.data_path,
                    move=False)
Example #20
    def crawl_source(self, source):
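        # Watchlist version of the OpenNames import; emits the symmetric difference of old and new terms.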
        if source.get('source_id') in IGNORE_SOURCES:
            return

        json_file = source.get('data', {}).get('json')
        url = urljoin(JSON_PATH, json_file)

        watchlist = Watchlist.by_foreign_id(url, {
            'label': source.get('source_id')
        })
        Permission.grant_foreign(watchlist, Role.SYSTEM_GUEST, True, False)
        log.info(" > OpenNames collection: %s", watchlist.label)
        previous_terms = watchlist.terms
        updated_terms = set()
        existing_entities = []
        db.session.flush()
        entities = requests.get(url).json().get('entities', [])
        for entity in entities:
            if entity.get('name') is None:
                continue
            selectors = []
            for on in entity.get('other_names', []):
                selectors.append(on.get('other_name'))
            for iden in entity.get('identities', []):
                if iden.get('number'):
                    selectors.append(iden.get('number'))
            ent = Entity.by_foreign_id(entity.get('uid'), watchlist, {
                'name': entity.get('name'),
                'category': CATEGORIES.get(entity.get('type'), OTHER),
                'data': entity,
                'selectors': selectors
            })
            updated_terms.update(ent.terms)
            existing_entities.append(ent.id)
            log.info("  # %s (%s)", ent.name, ent.category)
        watchlist.delete_entities(spare=existing_entities)
        terms = previous_terms.symmetric_difference(updated_terms)
        self.emit_watchlist(watchlist, terms)