def crawl(self):
    """Sync dashboard tickets into a watchlist and emit changed terms."""
    url = urljoin(self.host, '/ticket/all_closed/?format=json')
    watchlist = Watchlist.by_foreign_id(url, {
        'label': 'Investigative Dashboard Requests'
    })
    Permission.grant_foreign(watchlist, 'idashboard:occrp_staff',
                             True, False)
    seen_ids = []
    previous_terms = watchlist.terms
    updated_terms = set()
    db.session.flush()
    for endpoint in ('all_closed', 'all_open'):
        listing_url = urljoin(self.host,
                              '/ticket/%s/?format=json' % endpoint)
        payload = self.session.get(listing_url).json()
        for req in payload.get('paginator', {}).get('object_list'):
            category = REQUEST_TYPES.get(req.get('ticket_type'))
            if category is None:
                # Ticket type has no known mapping; skip it.
                continue
            ent = Entity.by_foreign_id(str(req.get('id')), watchlist, {
                'name': req.get('name'),
                'category': category,
                'data': req,
                'selectors': [req.get('name')]
            })
            updated_terms.update(ent.terms)
            seen_ids.append(ent.id)
            log.info(" # %s (%s)", ent.name, ent.category)
    # Drop entities no longer present upstream, then emit the term diff.
    watchlist.delete_entities(spare=seen_ids)
    terms = previous_terms.symmetric_difference(updated_terms)
    self.emit_watchlist(watchlist, terms)
def crawl(self):
    """Sync Investigative Dashboard tickets into a collection.

    Fetches open and closed tickets from the dashboard API, updates the
    matching entities, and deletes local entities that no longer exist
    upstream before emitting the collection with the gathered terms.
    """
    url = urljoin(self.host, '/ticket/all_closed/?format=json')
    collection = Collection.by_foreign_id(url, {
        'label': 'Investigative Dashboard Requests'
    })
    Permission.grant_foreign(collection, 'idashboard:occrp_staff',
                             True, False)
    existing_entities = []
    terms = set()
    db.session.flush()
    for endpoint in ['all_closed', 'all_open']:
        url = urljoin(self.host, '/ticket/%s/?format=json' % endpoint)
        data = self.session.get(url).json()
        # FIX: removed leftover debug statements (`print url` followed by
        # `continue`) that unconditionally skipped all ticket processing.
        for req in data.get('paginator', {}).get('object_list'):
            ent = self.update_entity(req, collection)
            if ent is not None:
                terms.update(ent.terms)
                existing_entities.append(ent.id)
                log.info(" # %s", ent.name)
    # Remove entities that disappeared from the upstream ticket list.
    for entity in collection.entities:
        if entity.id not in existing_entities:
            entity.delete()
    self.emit_collection(collection, terms)
def crawl(self):
    """Mirror dashboard tickets of known types into a collection."""
    url = urljoin(self.host, '/ticket/all_closed/?format=json')
    collection = Collection.by_foreign_id(url, {
        'label': 'Investigative Dashboard Requests'
    })
    Permission.grant_foreign(collection, 'idashboard:occrp_staff',
                             True, False)
    seen = []
    term_set = set()
    db.session.flush()
    for endpoint in ('all_closed', 'all_open'):
        listing_url = urljoin(self.host,
                              '/ticket/%s/?format=json' % endpoint)
        payload = self.session.get(listing_url).json()
        for req in payload.get('paginator', {}).get('object_list'):
            category = REQUEST_TYPES.get(req.get('ticket_type'))
            if category is None:
                # Unknown ticket type: no entity category mapping.
                continue
            ent = Entity.by_foreign_id(str(req.get('id')), collection, {
                'name': req.get('name'),
                'category': category,
                'data': req,
                'selectors': [req.get('name')]
            })
            term_set.update(ent.terms)
            seen.append(ent.id)
            log.info(" # %s (%s)", ent.name, ent.category)
    # Delete local entities that vanished upstream.
    for entity in collection.entities:
        if entity.id not in seen:
            entity.delete()
    self.emit_collection(collection, term_set)
def crawl_source(self, source):
    """Import one OpenNames source file as a collection of entities."""
    if source.get('source_id') in IGNORE_SOURCES:
        return
    json_file = source.get('data', {}).get('json')
    url = urljoin(JSON_PATH, json_file)
    source_name = source.get('source') or source.get('source_id')
    label = '%s - %s' % (source.get('publisher'), source_name)
    collection = self.find_collection(url, {'label': label})
    Permission.grant_foreign(collection, Role.SYSTEM_GUEST, True, False)
    log.info(" > OpenNames collection: %s", collection.label)
    for entity in requests.get(url).json().get('entities', []):
        # OpenNames stores aliases under 'other_name'; rename to 'name'.
        aliases = []
        for on in entity.get('other_names', []):
            on['name'] = on.pop('other_name', None)
            aliases.append(on)
        data = {
            'identifiers': [{
                'scheme': 'opennames:%s' % source.get('source_id'),
                'identifier': entity.get('uid')
            }],
            'other_names': aliases,
            'name': entity.get('name'),
            '$schema': SCHEMA.get(entity.get('type'),
                                  '/entity/entity.json#')
        }
        self.emit_entity(collection, data)
    self.emit_collection(collection)
def crawl_source(self, source):
    """Load an OpenNames source listing and emit each of its entities."""
    if source.get('source_id') in IGNORE_SOURCES:
        # Blacklisted source: nothing to do.
        return
    json_file = source.get('data', {}).get('json')
    listing_url = urljoin(JSON_PATH, json_file)
    source_name = source.get('source') or source.get('source_id')
    label = '%s - %s' % (source.get('publisher'), source_name)
    collection = self.find_collection(listing_url, {'label': label})
    Permission.grant_foreign(collection, Role.SYSTEM_GUEST, True, False)
    log.info(" > OpenNames collection: %s", collection.label)
    listing = requests.get(listing_url).json()
    for raw in listing.get('entities', []):
        record = {
            'identifiers': [{
                'scheme': 'opennames:%s' % source.get('source_id'),
                'identifier': raw.get('uid')
            }],
            'other_names': [],
            'name': raw.get('name'),
            '$schema': SCHEMA.get(raw.get('type'),
                                  '/entity/entity.json#')
        }
        # Rename the OpenNames alias field to the expected 'name' key.
        for on in raw.get('other_names', []):
            on['name'] = on.pop('other_name', None)
            record['other_names'].append(on)
        self.emit_entity(collection, record)
    self.emit_collection(collection)
def crawl(self):
    """Sync Investigative Dashboard tickets into a collection.

    Pulls both closed and open ticket listings, updates entities via
    ``update_entity``, prunes entities absent upstream, and emits the
    collection with the accumulated terms.
    """
    url = urljoin(self.host, '/ticket/all_closed/?format=json')
    collection = Collection.by_foreign_id(
        url, {'label': 'Investigative Dashboard Requests'})
    Permission.grant_foreign(collection, 'idashboard:occrp_staff',
                             True, False)
    existing_entities = []
    terms = set()
    db.session.flush()
    for endpoint in ['all_closed', 'all_open']:
        url = urljoin(self.host, '/ticket/%s/?format=json' % endpoint)
        data = self.session.get(url).json()
        # FIX: removed stray debug `print url` and `continue` which made
        # the loop skip every ticket page without processing it.
        for req in data.get('paginator', {}).get('object_list'):
            ent = self.update_entity(req, collection)
            if ent is not None:
                terms.update(ent.terms)
                existing_entities.append(ent.id)
                log.info(" # %s", ent.name)
    # Delete entities that no longer appear in the upstream listings.
    for entity in collection.entities:
        if entity.id not in existing_entities:
            entity.delete()
    self.emit_collection(collection, terms)
def crawl_collection(self, collection):
    """Replicate one Spindle collection as a local watchlist."""
    if not len(collection.get('subjects', [])):
        # Collections without subjects carry nothing to import.
        return
    url = urljoin(self.URL, '/api/collections/%s' % collection.get('id'))
    watchlist = Watchlist.by_foreign_id(url, {
        'label': collection.get('title')
    })
    perm_res = requests.get('%s/permissions' % url, headers=self.HEADERS)
    for perm in perm_res.json().get('results', []):
        Permission.grant_foreign(watchlist, perm.get('role'),
                                 perm.get('read'), perm.get('write'))
    log.info(" > Spindle collection: %s", watchlist.label)
    ent_res = requests.get('%s/entities' % url, headers=self.HEADERS)
    previous_terms = watchlist.terms
    updated_terms = set()
    kept_ids = []
    for entity in ent_res.json().get('results', []):
        if entity.get('name') is None:
            continue
        aliases = [on.get('alias')
                   for on in entity.get('other_names', [])]
        ent = Entity.by_foreign_id(entity.get('id'), watchlist, {
            'name': entity.get('name'),
            'category': SCHEMATA.get(entity.get('$schema'), OTHER),
            'data': entity,
            'selectors': aliases
        })
        updated_terms.update(ent.terms)
        kept_ids.append(ent.id)
        log.info(" # %s (%s)", ent.name, ent.category)
    # Prune vanished entities and emit only the changed terms.
    watchlist.delete_entities(spare=kept_ids)
    terms = previous_terms.symmetric_difference(updated_terms)
    self.emit_watchlist(watchlist, terms)
def crawl_collection(self, collection):
    """Mirror a Spindle collection (with its permissions) locally."""
    if not len(collection.get('subjects', [])):
        return
    url = urljoin(self.URL, '/api/collections/%s' % collection.get('id'))
    # NOTE: rebinds the parameter from the raw API dict to the DB record.
    collection = Collection.by_foreign_id(url, {
        'label': collection.get('title')
    })
    res = requests.get('%s/permissions' % url, headers=self.HEADERS)
    for perm in res.json().get('results', []):
        Permission.grant_foreign(collection, perm.get('role'),
                                 perm.get('read'), perm.get('write'))
    log.info(" > Spindle collection: %s", collection.label)
    res = requests.get('%s/entities' % url, headers=self.HEADERS)
    term_set = set()
    kept = []
    for raw in res.json().get('results', []):
        if raw.get('name') is None:
            continue
        aliases = [on.get('alias') for on in raw.get('other_names', [])]
        ent = Entity.by_foreign_id(raw.get('id'), collection, {
            'name': raw.get('name'),
            'category': SCHEMATA.get(raw.get('$schema'), OTHER),
            'data': raw,
            'selectors': aliases
        })
        term_set.update(ent.terms)
        kept.append(ent.id)
        log.info(" # %s (%s)", ent.name, ent.category)
    # Delete entities that were not seen in this crawl.
    for entity in collection.entities:
        if entity.id not in kept:
            entity.delete()
    self.emit_collection(collection, term_set)
def crawl_collection(self, collection):
    """Import one Spindle collection: permissions, entities, deletions.

    Skips collections without subjects. Each entity payload is
    normalised in place (schema mapping, relational-field pruning,
    date truncation, alias renaming) before being saved; entities no
    longer present upstream are deleted locally.
    """
    if not len(collection.get('subjects', [])):
        return
    url = urljoin(self.URL, '/api/collections/%s' % collection.get('id'))
    # Rebinds the parameter from the raw API dict to the DB record.
    collection = Collection.by_foreign_id(url, {
        'label': collection.get('title')
    })
    res = requests.get('%s/permissions' % url, headers=self.HEADERS)
    for perm in res.json().get('results', []):
        Permission.grant_foreign(collection, perm.get('role'),
                                 perm.get('read'), perm.get('write'))
    log.info(" > Spindle collection: %s", collection.label)
    res = requests.get('%s/entities' % url, headers=self.HEADERS)
    terms = set()
    existing_entities = []
    for entity in res.json().get('results', []):
        if entity.get('name') is None:
            continue
        # Map the remote schema URI to a local one (OTHER as fallback).
        entity['$schema'] = SCHEMATA.get(entity.get('$schema'), OTHER)
        if 'jurisdiction_code' in entity:
            entity['jurisdiction_code'] = \
                entity['jurisdiction_code'].lower()
        # Drop relational fields that cannot be imported standalone.
        entity.pop('members', None)
        entity.pop('memberships', None)
        entity.pop('assets', None)
        entity.pop('owners', None)
        entity.pop('family_first', None)
        entity.pop('family_second', None)
        entity.pop('social_first', None)
        entity.pop('social_second', None)
        # Truncate ISO datetimes to plain dates.
        for date_field in ['birth_date']:
            if date_field in entity and 'T' in entity[date_field]:
                entity[date_field], _ = entity[date_field].split('T', 1)
        # Spindle stores aliases under 'alias'; local schema wants 'name'.
        for on in entity.get('other_names', []):
            name = on.pop('alias', None)
            if name is not None:
                on['name'] = name
        entity['identifiers'] = [{
            'scheme': 'spindle',
            'identifier': entity.pop('id', None)
        }]
        ent = Entity.save(entity, collection_id=collection.id, merge=True)
        db.session.flush()
        terms.update(ent.terms)
        existing_entities.append(ent.id)
        log.info(" # %s", ent.name)
    # Delete local entities that vanished upstream.
    for entity in collection.entities:
        if entity.id not in existing_entities:
            entity.delete()
    self.emit_collection(collection, terms)
def crawl(self):
    """Walk open and closed dashboard tickets and emit the collection."""
    url = urljoin(self.host, '/ticket/all_closed/?format=json')
    coll = self.find_collection(url, {
        'label': 'Investigative Dashboard Requests'
    })
    Permission.grant_foreign(coll, 'idashboard:occrp_staff', True, False)
    for endpoint in ('all_closed', 'all_open'):
        listing = self.session.get(
            urljoin(self.host,
                    '/ticket/%s/?format=json' % endpoint)).json()
        for req in listing.get('paginator', {}).get('object_list'):
            # TODO: get the ID API fixed.
            self.update_entity(req, coll)
    self.emit_collection(coll)
def crawl(self):
    """Fetch dashboard tickets and update entities in the collection."""
    base = '/ticket/all_closed/?format=json'
    coll = self.find_collection(urljoin(self.host, base), {
        'label': 'Investigative Dashboard Requests'
    })
    Permission.grant_foreign(coll, 'idashboard:occrp_staff', True, False)
    for endpoint in ['all_closed', 'all_open']:
        page_url = urljoin(self.host,
                           '/ticket/%s/?format=json' % endpoint)
        tickets = self.session.get(page_url).json()
        object_list = tickets.get('paginator', {}).get('object_list')
        for req in object_list:
            # TODO: get the ID API fixed.
            self.update_entity(req, coll)
    self.emit_collection(coll)
def crawl_source(self, source):
    """Sync one OpenNames source into a collection, pruning removals."""
    if source.get('source_id') in IGNORE_SOURCES:
        return
    json_file = source.get('data', {}).get('json')
    url = urljoin(JSON_PATH, json_file)
    source_name = source.get('source') or source.get('source_id')
    label = '%s - %s' % (source.get('publisher'), source_name)
    collection = Collection.by_foreign_id(url, {'label': label})
    Permission.grant_foreign(collection, Role.SYSTEM_GUEST, True, False)
    log.info(" > OpenNames collection: %s", collection.label)
    term_set = set()
    kept_ids = []
    db.session.flush()
    for raw in requests.get(url).json().get('entities', []):
        # Rename each alias field from 'other_name' to 'name'.
        other_names = []
        for on in raw.get('other_names', []):
            on['name'] = on.pop('other_name', None)
            other_names.append(on)
        payload = {
            'identifiers': [{
                'scheme': 'opennames:%s' % source.get('source_id'),
                'identifier': raw.get('uid')
            }],
            'other_names': other_names,
            'name': raw.get('name'),
            '$schema': SCHEMA.get(raw.get('type'),
                                  '/entity/entity.json#')
        }
        ent = Entity.save(payload, collection_id=collection.id,
                          merge=True)
        db.session.flush()
        term_set.update(ent.terms)
        kept_ids.append(ent.id)
        log.info(" # %s", ent.name)
    for entity in collection.entities:
        if entity.id not in kept_ids:
            entity.delete()
    self.emit_collection(collection, term_set)
def crawl_item(self, item, sources, source):
    """Ingest one item, creating its source record on first sight."""
    source_data = item.meta.get('source', {})
    source_id = source_data.pop('foreign_id', source)
    if source_id is None:
        raise ValueError("No foreign_id for source given: %r" % item)
    if source_id not in sources:
        # First item for this source: create it and set permissions.
        label = source_data.get('label', source_id)
        sources[source_id] = self.create_source(foreign_id=source_id,
                                                label=label)
        if source_data.get('public'):
            Permission.grant_foreign(sources[source_id],
                                     Role.SYSTEM_GUEST, True, False)
        if source_data.get('users'):
            Permission.grant_foreign(sources[source_id],
                                     Role.SYSTEM_USER, True, False)
    log.info('Import: %r', item.identifier)
    meta = self.normalize_metadata(item)
    self.emit_file(sources[source_id], meta, item.data_path)
def crawl_source(self, source):
    """Import an OpenNames source file and delete stale entities."""
    if source.get('source_id') in IGNORE_SOURCES:
        # Explicitly blacklisted source; nothing to do.
        return
    json_file = source.get('data', {}).get('json')
    source_url = urljoin(JSON_PATH, json_file)
    source_name = source.get('source') or source.get('source_id')
    label = '%s - %s' % (source.get('publisher'), source_name)
    collection = Collection.by_foreign_id(source_url, {'label': label})
    Permission.grant_foreign(collection, Role.SYSTEM_GUEST, True, False)
    log.info(" > OpenNames collection: %s", collection.label)
    seen_terms = set()
    seen_ids = []
    db.session.flush()
    entities = requests.get(source_url).json().get('entities', [])
    for entity in entities:
        data = {
            'identifiers': [{
                'scheme': 'opennames:%s' % source.get('source_id'),
                'identifier': entity.get('uid')
            }],
            'other_names': [],
            'name': entity.get('name'),
            '$schema': SCHEMA.get(entity.get('type'),
                                  '/entity/entity.json#')
        }
        # Move each alias from 'other_name' to the expected 'name' key.
        for on in entity.get('other_names', []):
            on['name'] = on.pop('other_name', None)
            data['other_names'].append(on)
        ent = Entity.save(data, collection_id=collection.id, merge=True)
        db.session.flush()
        seen_terms.update(ent.terms)
        seen_ids.append(ent.id)
        log.info(" # %s", ent.name)
    for entity in collection.entities:
        if entity.id not in seen_ids:
            entity.delete()
    self.emit_collection(collection, seen_terms)
def crawl_source(self, source):
    """Sync an OpenNames source using foreign-id Entity records."""
    if source.get('source_id') in IGNORE_SOURCES:
        return
    json_file = source.get('data', {}).get('json')
    url = urljoin(JSON_PATH, json_file)
    source_name = source.get('source') or source.get('source_id')
    label = '%s - %s' % (source.get('publisher'), source_name)
    collection = Collection.by_foreign_id(url, {'label': label})
    Permission.grant_foreign(collection, Role.SYSTEM_GUEST, True, False)
    log.info(" > OpenNames collection: %s", collection.label)
    term_set = set()
    kept = []
    db.session.flush()
    for raw in requests.get(url).json().get('entities', []):
        if raw.get('name') is None:
            # Entities without a name cannot be indexed.
            continue
        selectors = [on.get('other_name')
                     for on in raw.get('other_names', [])]
        for iden in raw.get('identities', []):
            if iden.get('number'):
                selectors.append(iden.get('number'))
        ent = Entity.by_foreign_id(raw.get('uid'), collection, {
            'name': raw.get('name'),
            'category': CATEGORIES.get(raw.get('type'), OTHER),
            'data': raw,
            'selectors': selectors
        })
        term_set.update(ent.terms)
        kept.append(ent.id)
        log.info(" # %s (%s)", ent.name, ent.category)
    for entity in collection.entities:
        if entity.id not in kept:
            entity.delete()
    self.emit_collection(collection, term_set)
def crawl_item(self, item, sources, source):
    """Emit a single crawled file into its (possibly new) source."""
    meta_source = item.meta.get('source', {})
    sid = meta_source.pop('foreign_id', source)
    if sid is None:
        raise ValueError("No foreign_id for source given: %r" % item)
    if sid not in sources:
        sources[sid] = self.create_source(
            foreign_id=sid, label=meta_source.get('label', sid))
        # Grant access roles as requested by the item's metadata.
        if meta_source.get('public'):
            Permission.grant_foreign(sources[sid], Role.SYSTEM_GUEST,
                                     True, False)
        if meta_source.get('users'):
            Permission.grant_foreign(sources[sid], Role.SYSTEM_USER,
                                     True, False)
    log.info('Import: %r', item.identifier)
    self.emit_file(sources[sid], self.normalize_metadata(item),
                   item.data_path)
def crawl_item(self, item):
    """Import a single crawled item into its target collection.

    Raises:
        ValueError: when the item metadata carries no ``foreign_id``.
    """
    coll_data = item.meta.get('source', {})
    # FIX: pop with a None default so a missing key triggers the
    # intended ValueError below rather than an opaque KeyError.
    coll_fk = coll_data.pop('foreign_id', None)
    if coll_fk is None:
        raise ValueError("No foreign_id for collection given: %r" % item)
    if coll_fk not in self.collections:
        # First item for this collection: create it, apply permissions.
        label = coll_data.get('label', coll_fk)
        self.collections[coll_fk] = Collection.create({
            'foreign_id': coll_fk,
            'label': label
        })
        if coll_data.get('public'):
            Permission.grant_foreign(self.collections[coll_fk],
                                     Role.SYSTEM_GUEST, True, False)
        db.session.commit()
    log.info('Import: %r', item.identifier)
    meta = self.normalize_metadata(item)
    ingest_file(self.collections[coll_fk].id, meta, item.data_path,
                move=False)
def crawl_item(self, item):
    """Import a single crawled item into a managed collection.

    Raises:
        ValueError: when the item metadata carries no ``foreign_id``.
    """
    coll_data = item.meta.get('source', {})
    # FIX: pop with a None default so a missing key triggers the
    # intended ValueError below rather than an opaque KeyError.
    coll_fk = coll_data.pop('foreign_id', None)
    if coll_fk is None:
        raise ValueError("No foreign_id for collection given: %r" % item)
    if coll_fk not in self.collections:
        # First item for this collection: create it as a managed
        # collection and apply permissions.
        label = coll_data.get('label', coll_fk)
        self.collections[coll_fk] = Collection.create({
            'foreign_id': coll_fk,
            'label': label,
            'managed': True
        })
        if coll_data.get('public'):
            Permission.grant_foreign(self.collections[coll_fk],
                                     Role.SYSTEM_GUEST, True, False)
        db.session.commit()
    log.info('Import: %r', item.identifier)
    meta = self.normalize_metadata(item)
    ingest_file(self.collections[coll_fk].id, meta, item.data_path,
                move=False)
def crawl_item(self, item, source):
    """Ingest one item, lazily creating its Source record."""
    source_data = item.meta.get('source', {})
    fk = source_data.pop('foreign_id', source)
    if fk is None:
        raise ValueError("No foreign_id for source given: %r" % item)
    if fk not in self.sources:
        # Unseen source: create the record and commit before ingest.
        self.sources[fk] = Source.create({
            'foreign_id': fk,
            'label': source_data.get('label', fk)
        })
        if source_data.get('public'):
            Permission.grant_foreign(self.sources[fk],
                                     Role.SYSTEM_GUEST, True, False)
        db.session.commit()
    log.info('Import: %r', item.identifier)
    meta = self.normalize_metadata(item)
    ingest_file(self.sources[fk].id, meta, item.data_path, move=False)
def crawl_source(self, source):
    """Mirror an OpenNames source into a watchlist; emit the term diff."""
    if source.get('source_id') in IGNORE_SOURCES:
        return
    json_file = source.get('data', {}).get('json')
    url = urljoin(JSON_PATH, json_file)
    watchlist = Watchlist.by_foreign_id(url, {
        'label': source.get('source_id')
    })
    Permission.grant_foreign(watchlist, Role.SYSTEM_GUEST, True, False)
    log.info(" > OpenNames collection: %s", watchlist.label)
    previous_terms = watchlist.terms
    updated_terms = set()
    kept_ids = []
    db.session.flush()
    for raw in requests.get(url).json().get('entities', []):
        if raw.get('name') is None:
            # Skip unnamed entities: nothing to index.
            continue
        selectors = [on.get('other_name')
                     for on in raw.get('other_names', [])]
        for iden in raw.get('identities', []):
            if iden.get('number'):
                selectors.append(iden.get('number'))
        ent = Entity.by_foreign_id(raw.get('uid'), watchlist, {
            'name': raw.get('name'),
            'category': CATEGORIES.get(raw.get('type'), OTHER),
            'data': raw,
            'selectors': selectors
        })
        updated_terms.update(ent.terms)
        kept_ids.append(ent.id)
        log.info(" # %s (%s)", ent.name, ent.category)
    # Remove vanished entities, then emit only terms that changed.
    watchlist.delete_entities(spare=kept_ids)
    changed = previous_terms.symmetric_difference(updated_terms)
    self.emit_watchlist(watchlist, changed)