def test_load_sqlite(self):
    count = Collection.all().count()
    assert 0 == count, count
    db_uri = 'sqlite:///' + self.get_fixture_path('kek.sqlite')
    os.environ['ALEPH_TEST_BULK_DATABASE_URI'] = db_uri
    yml_path = self.get_fixture_path('kek.yml')
    config = load_config_file(yml_path)
    bulk_load(config)
    count = Collection.all().count()
    assert 1 == count, count
    coll = Collection.by_foreign_id('kek')
    assert coll.category == 'scrape', coll.category
    _, headers = self.login(is_admin=True)
    flush_index()
    res = self.client.get('/api/2/entities?q=friede+springer',
                          headers=headers)
    assert res.status_code == 200, res
    assert res.json['total'] == 1, res.json
    res0 = res.json['results'][0]
    assert res0['id'] == '9895ccc1b3d6444ccc6371ae239a7d55c748a714', res0

def test_load_csv(self):
    count = Collection.all().count()
    assert 0 == count, count
    db_uri = 'file://' + self.get_fixture_path('experts.csv')
    os.environ['ALEPH_TEST_BULK_CSV'] = db_uri
    yml_path = self.get_fixture_path('experts.yml')
    config = load_config_file(yml_path)
    bulk_load(config)
    coll = Collection.by_foreign_id('experts')
    assert coll.category == 'scrape', coll.category
    _, headers = self.login(is_admin=True)
    self.flush_index()
    count = Collection.all().count()
    assert 1 == count, count
    res = self.client.get('/api/2/entities?q=Greenfield',
                          headers=headers)
    assert res.status_code == 200, res
    assert res.json['total'] == 1, res.json
    res0 = res.json['results'][0]
    assert res0['id'] == '6897ef1acd633c229d812c1c495f030d212c9081', res0

def compute_collections():
    """Update collection caches, including the global stats cache."""
    authz = Authz.from_role(None)
    schemata = defaultdict(int)
    countries = defaultdict(int)
    categories = defaultdict(int)
    for collection in Collection.all():
        compute_collection(collection)

        if authz.can(collection.id, authz.READ):
            categories[collection.category] += 1
            things = index.get_collection_things(collection.id)
            for schema, count in things.items():
                schemata[schema] += count
            for country in collection.countries:
                countries[country] += 1

    log.info("Updating global statistics cache...")
    data = {
        "collections": sum(categories.values()),
        "schemata": dict(schemata),
        "countries": dict(countries),
        "categories": dict(categories),
        "things": sum(schemata.values()),
    }
    key = cache.key(cache.STATISTICS)
    cache.set_complex(key, data, expires=cache.EXPIRE)

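# A minimal sketch (not from the snippets above) of reading back the
# statistics written by compute_collections(). It assumes the cache object
# exposes a get_complex() counterpart to set_complex() that decodes the
# stored value; the function name get_statistics is illustrative.
def get_statistics():
    key = cache.key(cache.STATISTICS)
    return cache.get_complex(key) or {}
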
def test_load_sqlite(self):
    count = Collection.all().count()
    assert 0 == count, count
    yml_path = self.get_fixture_path('kek.yml')
    config = load_config_file(yml_path)
    bulk_load(config)
    flush_index()
    count = Collection.all().count()
    assert 1 == count, count
    res = self.client.get('/api/2/entities?q=friede+springer')
    assert res.status_code == 200, res
    assert res.json['total'] == 1, res.json
    res0 = res.json['results'][0]
    assert res0['id'] == '9895ccc1b3d6444ccc6371ae239a7d55c748a714', res0

def upgrade_collections():
    for collection in Collection.all(deleted=True):
        if collection.deleted_at is not None:
            delete_collection(collection, keep_metadata=True, sync=True)
        else:
            compute_collection(collection, force=True)
    # update global cache:
    compute_collections()

def upgrade_collections():
    for collection in Collection.all(deleted=True):
        if collection.deleted_at is not None:
            delete_collection(collection, keep_metadata=True,
                              sync=True, reset_sync=True)
        else:
            refresh_collection(collection.id, sync=True)
            compute_collection(collection, sync=True)

def index_collections(entities=False, refresh=False):
    q = Collection.all(deleted=True)
    q = q.order_by(Collection.updated_at.desc())
    for collection in q:
        log.info("Index [%s]: %s", collection.id, collection.label)
        if entities and collection.deleted_at is None:
            index_collection_entities.delay(collection_id=collection.id)
        if refresh:
            refresh_collection(collection.id, sync=False)
        index.index_collection(collection)

def analyze(foreign_id=None):
    """Re-analyze documents in the given collection (or throughout)."""
    if foreign_id:
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            raise ValueError("No such collection: %r" % foreign_id)
        analyze_collection.delay(collection.id)
    else:
        for collection in Collection.all():
            analyze_collection.delay(collection.id)

def test_load_csv(self):
    count = Collection.all().count()
    assert 0 == count, count
    db_uri = 'file://' + self.get_fixture_path('experts.csv')
    os.environ['ALEPH_TEST_BULK_CSV'] = db_uri
    yml_path = self.get_fixture_path('experts.yml')
    config = load_config_file(yml_path)
    bulk_load(config)
    coll = Collection.by_foreign_id('experts')
    assert coll.category == 'scrape', coll.category
    _, headers = self.login(is_admin=True)
    count = Collection.all().count()
    assert 1 == count, count
    url = '/api/2/entities?filter:schemata=Thing&q=Greenfield'
    res = self.client.get(url, headers=headers)
    assert res.status_code == 200, res
    assert res.json['total'] == 1, res.json

def collections(secret, casefile):
    """List all collections."""
    collections = []
    for coll in Collection.all():
        if secret is not None:
            if coll.secret != secret:
                continue
        if casefile is not None:
            if coll.casefile != casefile:
                continue
        collections.append((coll.foreign_id, coll.id, coll.label))
    print(tabulate(collections, headers=["Foreign ID", "ID", "Label"]))

def peek_query(state):
    """Peek into hidden collections.

    This allows users to retrieve an approximate result count of a given
    query against those collections which they are not authorised to view.
    It is a rudimentary collaboration mechanism.
    """
    filters = state.filters
    cq = Collection.all()
    cq = cq.filter(not_(Collection.id.in_(state.authz.collections_read)))
    cq = cq.filter(Collection.creator_id != None)  # noqa
    cq = cq.filter(Collection.private != True)  # noqa
    collections = {c.id: c for c in cq}
    filters['collection_id'] = collections.keys()
    q = text_query(state.text)
    q = {
        'query': filter_query(q, filters),
        'size': 0,
        'aggregations': {
            'collections': {
                'terms': {'field': 'collection_id', 'size': 1000}
            }
        },
        '_source': False
    }
    result = es.search(index=es_index, body=q, doc_type=TYPE_DOCUMENT)
    roles = {}
    total = 0
    aggs = result.get('aggregations', {}).get('collections', {})
    for bucket in aggs.get('buckets', []):
        collection = collections.get(bucket.get('key'))
        if collection is None or collection.creator is None:
            continue
        total += bucket.get('doc_count')
        if collection.creator_id in roles:
            roles[collection.creator_id]['total'] += bucket.get('doc_count')
        else:
            roles[collection.creator_id] = {
                'name': collection.creator.name,
                'email': collection.creator.email,
                'total': bucket.get('doc_count')
            }
    roles = sorted(roles.values(), key=lambda r: r['total'], reverse=True)
    roles = [format_total(r) for r in roles]
    return format_total({
        'roles': roles,
        'active': total > 0,
        'total': total
    })

def test_load_sqlite(self):
    count = Collection.all().count()
    assert 0 == count, count
    db_uri = 'sqlite:///' + self.get_fixture_path('kek.sqlite')
    os.environ['ALEPH_TEST_BULK_DATABASE_URI'] = db_uri
    yml_path = self.get_fixture_path('kek.yml')
    config = load_config_file(yml_path)
    bulk_load(config)
    count = Collection.all().count()
    assert 1 == count, count
    coll = Collection.by_foreign_id('kek')
    assert coll.category == 'scrape', coll.category
    _, headers = self.login(is_admin=True)
    url = '/api/2/entities?filter:schemata=Thing&q=friede+springer'
    res = self.client.get(url, headers=headers)
    assert res.status_code == 200, res
    assert res.json['total'] == 1, res.json
    res0 = res.json['results'][0]
    key = '9895ccc1b3d6444ccc6371ae239a7d55c748a714'
    assert res0['id'].startswith(key), res0

def update(foreign_id=None, index=False, process=False, reset=False):
    """Re-index all the collections and entities."""
    update_roles()
    q = Collection.all(deleted=True)
    if foreign_id is not None:
        q = [get_collection(foreign_id)]
    for collection in q:
        if reset:
            reset_collection(collection, sync=True)
        refresh_collection(collection.id)
        index_collection(collection)
        if collection.deleted_at is not None:
            continue
        if index or process:
            payload = {'ingest': process}
            queue_task(collection, OP_PROCESS, payload=payload)

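# A minimal sketch (assumption, not from the snippets above) of queueing a
# single collection for re-processing, reusing the get_collection() and
# queue_task() calls shown in update(); the helper name reprocess is
# illustrative.
def reprocess(foreign_id, ingest=True):
    collection = get_collection(foreign_id)
    queue_task(collection, OP_PROCESS, payload={'ingest': ingest})
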
def index_collections(sync=False):
    for collection in Collection.all(deleted=True):
        compute_collection(collection, sync=sync)

def load_collections():
    tx = get_graph().begin()
    for collection in Collection.all():
        log.info("Index collection: %s", collection.label)
        load_collection(tx, collection)
    tx.commit()

def reindex_full(flush=False):
    """Re-index all collections."""
    for collection in Collection.all():
        _reindex_collection(collection, flush=flush)

def index_collections(refresh=False):
    for collection in Collection.all(deleted=True):
        if refresh:
            refresh_collection(collection.id, sync=True)
        index.index_collection(collection)

def index_collections(entities=False, refresh=False):
    q = Collection.all(deleted=True)
    q = q.order_by(Collection.updated_at.desc())
    for collection in q:
        index_collection(collection, entities=entities, refresh=refresh)

def upgrade_collections():
    for collection in Collection.all(deleted=True):
        if collection.deleted_at is not None:
            delete_collection(collection, keep_metadata=True, sync=True)
            continue
        compute_collection(collection, sync=True)

def index_collections():
    for collection in Collection.all(deleted=True):
        index.index_collection(collection)

def compute_collections():
    for collection in Collection.all():
        compute_collection(collection)

def collections():
    """List all collections."""
    for collection in Collection.all():
        print(collection.id, collection.foreign_id, collection.label)

def cleanup_collections():
    """Reindex collections periodically."""
    for collection in Collection.all():
        update_collection(collection)

def exportbalkhash(foreign_id=None):
    collections = Collection.all()
    if foreign_id is not None:
        collections = [get_collection(foreign_id)]
    for collection in collections:
        _export_balkhash_collection(collection)

def collections():
    """List all collections."""
    collections = []
    for coll in Collection.all():
        collections.append((coll.foreign_id, coll.id, coll.label))
    print(tabulate(collections, headers=['Foreign ID', 'ID', 'Label']))

def compute_collections():
    for collection in Collection.all():
        compute_collection(collection, sync=False)

def index_collections():
    for collection in Collection.all(deleted=True):
        log.info("Index [%s]: %s", collection.id, collection.label)
        index.index_collection(collection)