def test_post_multiple(self):
    """Post two articles in one request and check API, database and index."""
    first = test_article()
    second = test_article()

    posted = self._post_articles([first, second])
    self.assertEqual(2, len(posted))
    # POST should only return IDs
    self.assertEqual(set(posted[0].keys()), {'id'})

    # The default listing exposes titles but no text
    listed = self._get_articles()['results']
    self.assertEqual({entry['title'] for entry in listed},
                     {first['title'], second['title']})
    self.assertNotIn("text", listed[0].keys())

    # Asking for text explicitly returns it
    with_text = self._get_articles(text=True)['results']
    self.assertEqual({entry['text'] for entry in with_text},
                     {first['text'], second['text']})

    # The returned ids point at database rows, in posting order
    stored = [Article.objects.get(pk=entry["id"]) for entry in posted]
    self.assertEqual(stored[0].title, first['title'])
    self.assertEqual(stored[1].title, second['title'])

    # Are the articles added to the index?
    amcates.ES().refresh()
    indexed_ids = set(amcates.ES().query_ids(filters={"sets": self.aset.id}))
    self.assertEqual(len(indexed_ids), 2)
def test_create(self):
    """Create one article through the API and check the listing and the index."""
    aset = amcattest.create_test_set()

    # is the set empty? (aka can we get the results)
    url = ArticleViewSet.get_url(project=aset.project.id, articleset=aset.id)
    listing = self.get(url)
    self.assertEqual(listing['results'], [])

    body = dict(
        text='bla',
        headline='headline',
        date='2013-01-01T00:00:00',
        medium='test_medium',
    )
    created = self.post(url, body, as_user=aset.project.owner)
    if isinstance(created, list):
        # A list response must hold exactly one article
        [created] = created
    self.assertEqual(created['headline'], body['headline'])

    listing = self.get(url)
    self.assertEqual(len(listing['results']), 1)
    article = listing['results'][0]
    self.assertEqual(article['headline'], body['headline'])
    self.assertEqual(article['project'], aset.project_id)
    self.assertEqual(article['length'], 2)

    # Is the result added to the elastic index as well?
    from amcat.tools import amcates
    amcates.ES().flush()
    hits = list(amcates.ES().query(filters={"sets": aset.id},
                                   fields=["text", "headline", 'medium']))
    self.assertEqual(len(hits), 1)
    self.assertEqual(hits[0].medium, "test_medium")
    self.assertEqual(hits[0].headline, "headline")
def test_create(self):
    """Can we create/store/index an article object?"""
    unsaved = amcattest.create_test_article(
        create=False,
        date='2010-12-31',
        headline=u'\ua000abcd\u07b4',
    )
    Article.create_articles([unsaved], create_id=True)

    from_db = Article.objects.get(pk=unsaved.id)

    amcates.ES().flush()
    hits = amcates.ES().query(filters={'ids': [unsaved.id]},
                              fields=["date", "headline"])
    from_es = list(hits)[0]

    # The non-ASCII headline must be the same in the database and the index
    self.assertEqual(unsaved.headline, from_db.headline)
    self.assertEqual(unsaved.headline, from_es.headline)
    # The date-only input is compared as a midnight timestamp in both stores
    self.assertEqual('2010-12-31T00:00:00', from_db.date.isoformat())
    self.assertEqual('2010-12-31T00:00:00', from_es.date.isoformat())
def test_post_properties(self):
    """Post an article with an extra 'foo' property and look it up everywhere."""
    posted = self._post_articles(test_article(foo='bar'))
    article_id = posted['id']

    # The property can be used as an index filter
    matching_ids = set(amcates.ES().query_ids(filters={"foo": "bar"}))
    self.assertEqual(matching_ids, {posted["id"]})

    # The property is part of the indexed document
    indexed = amcates.ES().get(id=article_id)
    self.assertEqual(indexed['foo'], 'bar')

    # The property is exposed under 'properties' when fetching the article
    fetched = self._get_article(aid=article_id)
    self.assertEqual(fetched['properties']['foo'], 'bar')
def hash_articles(cls, articleset: ArticleSet, ignore_fields: set) -> Iterable[Tuple[int, str]]:
    """
    Finds all articles in an articleset, and hashes articles as a tuple of
    field values, ordered alphabetically by field name. Fields in
    ignore_fields will not affect the hash.

    Hashes for two articles are equal, if and only if for each field that is
    not in ignore_fields, the values of those fields are equal in both
    articles.

    When ignore_fields is empty the 'hash' field stored in the index is
    returned as-is instead of computing a new one.

    @param articleset    The articleset that is to be searched
    @param ignore_fields A set of fields that should not be included in the
                         calculated hashes
    @return              An iterable of (<article_id>, <hash>) tuples.
    """
    use_stored_hash = not ignore_fields
    if use_stored_hash:
        fields = ["hash"]
    else:
        all_fields = STATIC_FIELDS + list(articleset.get_used_properties())
        fields = sorted(f for f in all_fields if f not in ignore_fields)

    query = {
        "query": {
            "constant_score": {
                "filter": {"term": {"sets": articleset.id}}
            }
        }
    }

    # The scan is issued once only; the original built a second, unused scan.
    for hit in amcates.ES().scan(query=query, fields=fields):
        article_id = int(hit['_id'])
        if use_stored_hash:
            yield article_id, hit['fields']['hash'][0]
            continue

        # Fields missing from the hit count as None, so they still take part
        # in the hash; only the first value of each field is used.
        art_tuple = tuple(str(hit['fields'].get(k, [None])[0]) for k in fields)
        digest = hash_class(repr(art_tuple).encode()).hexdigest()
        yield article_id, digest
def handle(self, *args, **options):
    """
    Rewrite the indexed values of every article from the database, in
    batches of GROUP_SIZE ids, printing progress and throughput per batch.

    The 'sets' and 'hash' keys are removed from each article dict before it
    is sent to the index, so those two fields are not overwritten.
    """
    es = amcates.ES()

    print("Counting articles..", end=" ")
    sys.stdout.flush()
    narticles = es.count(query="*", filters={})
    print(narticles)

    then, now = datetime.datetime.now(), datetime.datetime.now()
    for i, article_ids in enumerate(grouper(es.query_ids(), n=GROUP_SIZE)):
        progress = (float(i * GROUP_SIZE) / float(narticles)) * 100
        print("{} of {} ({:.2f}%)".format(i * GROUP_SIZE, narticles, progress))

        articles = Article.objects.filter(
            id__in=article_ids).select_related("medium")

        # Build a real list: on Python 3 map() is a one-shot iterator, so the
        # clean-up loop below would exhaust it and the bulk update would
        # silently receive an empty dict.
        article_dicts = [get_article_dict(article) for article in articles]
        for article_dict in article_dicts:
            del article_dict["sets"]
            del article_dict["hash"]

        es.bulk_update_values({a["id"]: a for a in article_dicts})

        then, now = now, datetime.datetime.now()
        print("Articles per second: ", end="")
        print(int(GROUP_SIZE / (now - then).total_seconds()))

    print("Done.")
def cache_results(self):
    """
    Cache the favourite article set ids and the per-set article counts for
    the project named by the 'project_for_favourites' GET parameter.

    Without that parameter both caches are set to None.
    """
    # Only cache once! (is this a hack?)
    # Later calls on this instance hit the no-op lambda instead.
    self.cache_results = lambda: None

    try:
        # HACK!
        project = self.context['request'].GET['project_for_favourites']
    except KeyError:
        # no project given, so nothing to do :-(
        self.fav_articlesets = None
        self.nn = None
        return

    favourites = ArticleSet.objects.filter(favourite_of_projects=project)
    self.fav_articlesets = set(favourites.values_list("id", flat=True))

    # Sets owned by the project plus sets linked to it
    owned_or_linked = Q(project=project) | Q(projects_set=project)
    sets = list(
        ArticleSet.objects.filter(owned_or_linked).values_list("id", flat=True))

    counts = amcates.ES().aggregate_query(filters={'sets': sets},
                                          group_by='sets')
    self.nn = dict(counts)
    self._cached = True
def handle(self, *args, **options):
    """
    Recompute the 'hash' value of every indexed article, in batches of
    GROUP_SIZE ids, printing progress and throughput after each batch.
    """
    es = amcates.ES()

    print("Counting articles..", end=" ")
    sys.stdout.flush()
    narticles = es.count(query="*", filters={})
    print(narticles)

    previous, current = datetime.datetime.now(), datetime.datetime.now()
    batches = grouper(es.query_ids(), n=GROUP_SIZE)
    for batch_no, article_ids in enumerate(batches):
        # Progress is counted in whole batches
        done = batch_no * GROUP_SIZE
        progress = (float(done) / float(narticles)) * 100
        print("{} of {} ({:.2f}%)".format(done, narticles, progress))

        # Fetch only the fields listed in HASH_FIELDS and hash each article
        es_articles = es.query_all(filters={"ids": article_ids},
                                   fields=HASH_FIELDS)
        new_hashes = {
            es_article.id: {"hash": _get_hash(es_article.to_dict())}
            for es_article in es_articles
        }
        es.bulk_update_values(new_hashes)

        previous, current = current, datetime.datetime.now()
        print("Articles per second: ", end="")
        elapsed = (current - previous).total_seconds()
        print(int(GROUP_SIZE / elapsed))

    print("Done.")
def test_post(self):
    """Test whether posting and retrieving an article works correctly"""
    self.set_up()
    project = amcattest.create_test_project(owner=self.user)
    aset = amcattest.create_test_set(project=project)

    payload = dict(
        date=datetime.datetime.now().isoformat(),
        headline='Test child',
        medium='Fantasy',
        text='Hello Universe',
        pagenr=1,
        url='http://example.org',
        uuid='c691fadf-3c45-4ed6-93fe-f035b5f500af',
    )

    url = "/api/v4/projects/{p.id}/articlesets/{s.id}/articles/".format(
        p=project, s=aset)
    self.post(url, payload, self.user)
    amcates.ES().flush()

    results = self.get(url)["results"]
    self.assertEqual(len(results), 1)

    retrieved = results[0]
    self.assertEqual(retrieved["headline"], payload['headline'])
    # Dates are compared after parsing, not as raw strings
    self.assertEqual(toolkit.readDate(retrieved["date"]),
                     toolkit.readDate(payload['date']))
    self.assertEqual(retrieved["uuid"], payload['uuid'])
def remove_articles(self, articles, remove_from_index=True, monitor=NullMonitor()):
    """
    Remove articles from this articleset. The matching CodedArticle rows of
    codingjobs on this set are deleted as well, and the index is updated
    when `remove_from_index` is True.

    @param articles: articles to be removed
    @type articles: iterable with indexing of integers or Article objects

    @param remove_from_index: notify elasticsearch of changes
    @type remove_from_index: bool

    @param monitor: progress monitor; a submonitor with four steps is used
    """
    monitor = monitor.submonitor(4)

    # Normalise to plain ids for the index call
    to_remove = set()
    for art in articles:
        to_remove.add(art if type(art) is int else art.id)

    monitor.update(message="Deleting articles from database")
    memberships = ArticleSetArticle.objects.filter(articleset=self,
                                                   article__in=articles)
    memberships.delete()

    monitor.update(message="Deleting coded articles from database")
    coded = CodedArticle.objects.filter(codingjob__articleset=self,
                                        article__in=articles)
    coded.delete()

    if not remove_from_index:
        # Still consume the step so the monitor reaches its four updates
        monitor.update()
    else:
        monitor.update(message="Deleting from index")
        amcates.ES().remove_from_set(self.id, to_remove)

    monitor.update(message="Deleting from cache")
    self._reset_property_cache()
def test_dates(self):
    """Test whether date deserialization works, see #66"""
    samples = (
        '2001-01-01',
        '1992-12-31T23:59',
        '2012-02-29T12:34:56.789',
        datetime.datetime.now(),
    )
    for sample in samples:
        article = amcattest.create_test_article(date=sample)
        amcates.ES().flush()

        response = self.get("/api/v4/search", ids=article.id)
        returned = toolkit.readDate(response['results'][0]['date'])
        # Both sides go through readDate, so only the parsed value matters
        self.assertEqual(returned, toolkit.readDate(str(sample)))
def get_hashes(self):
    """
    Yield an (article_id, hash) pair for every article in the articleset
    given in self.options['articleset'].

    Fields whose 'skip_<field>' option is set are left out of the hash. When
    no field is skipped, the 'hash' value stored in the index is returned
    directly; otherwise a hash is computed over the JSON dump of the
    remaining field values.
    """
    fields = [
        f for f in FIELDS if not self.options.get("skip_{}".format(f))
    ]
    if fields == FIELDS:
        fields = ["hash"]
    # Decide once, not for every hit
    use_stored_hash = fields == ["hash"]

    def first_or_none(values):
        # Use the first value of a field, or None when the field is absent
        return values[0] if values is not None else values

    setid = self.options['articleset'].id
    query = {
        "query": {
            "constant_score": {
                "filter": {"term": {"sets": setid}}
            }
        }
    }

    for hit in amcates.ES().scan(query=query, fields=fields):
        hit_fields = hit['fields']
        if use_stored_hash:
            digest = hit_fields['hash'][0]
        else:
            values = {f: first_or_none(hit_fields.get(f)) for f in fields}
            # Hash constructors need bytes on Python 3. json.dumps emits
            # ASCII by default, so encoding does not change the digest.
            payload = json.dumps(values).encode("utf-8")
            digest = hash_class(payload).hexdigest()
        yield int(hit['_id']), digest
def test_parents(self):
    """Test parents via nesting"""
    aset = amcattest.create_test_set()
    url = ArticleViewSet.get_url(project=aset.project.id, articleset=aset.id)

    base = dict(
        text='bla',
        headline='headline',
        date='2013-01-01T00:00:00',
        medium='test_medium',
    )
    children = [dict(base, headline='c1'), dict(base, headline='c2')]
    parent = dict(base, headline='parent')
    # The children travel as a JSON string inside the parent's form data
    body = dict(parent, children=json.dumps(children))

    self.post(url, body, as_user=aset.project.owner)
    amcates.ES().flush()

    # result should have 3 articles, with c1 and c2 .parent set to parent
    by_headline = {}
    for article in self.get(url)['results']:
        by_headline[article['headline']] = article

    self.assertEqual(len(by_headline), 3)
    parent_id = by_headline['parent']['id']
    self.assertEqual(by_headline['c1']['parent'], parent_id)
    self.assertEqual(by_headline['c2']['parent'], parent_id)
    self.assertEqual(by_headline['parent']['parent'], None)
def test_children(self):
    """Post a parent with one nested child and check the parent link."""
    project = amcattest.create_test_project()
    aset = amcattest.create_test_set(project=project)

    # need to json dump the children because the django client does weird stuff with post data
    child = dict(date='2001-01-02', headline='Test child',
                 medium='Fantasy', text='Hello Universe')
    parent = dict(date='2001-01-01', headline='Test parent',
                  medium='My Imagination', text='Hello World',
                  children=json.dumps([child]))

    url = "/api/v4/projects/{p.id}/articlesets/{s.id}/articles/".format(
        p=project, s=aset)
    self.post(url, parent, as_user=self.user)
    amcates.ES().flush()

    by_headline = {art['headline']: art for art in self.get(url)["results"]}
    self.assertEqual(set(by_headline), {'Test parent', 'Test child'})
    self.assertEqual(by_headline['Test child']['parent'],
                     by_headline['Test parent']['id'])
def test_parents_multiple(self):
    """Can we add multiple objects with children?"""
    aset = amcattest.create_test_set()
    url = ArticleViewSet.get_url(project=aset.project.id, articleset=aset.id)

    base = dict(
        text='bla',
        headline='headline',
        date='2013-01-01T00:00:00',
        medium='test_medium',
    )
    child = dict(base, headline='c')
    leaf = dict(base, headline='l')
    parent_with_child = dict(dict(base, headline='p'), children=[child])

    # The whole request is one JSON body, so children stay a nested list
    body = json.dumps([leaf, parent_with_child])
    self.post(url, body, as_user=aset.project.owner,
              request_options={"content_type": 'application/json'})
    amcates.ES().flush()

    by_headline = {art['headline']: art for art in self.get(url)['results']}
    self.assertEqual(len(by_headline), 3)
    self.assertEqual(by_headline['c']['parent'], by_headline['p']['id'])
    self.assertEqual(by_headline['p']['parent'], None)
    self.assertEqual(by_headline['l']['parent'], None)
def _post_articles(self, data, expected_status=201, as_user="******", return_json=None, **url_kwargs):
    """
    POST `data` as JSON to the URL given by self.url_set(**url_kwargs),
    assert the response status and refresh the index.

    @param data: JSON-serialisable payload
    @param expected_status: status code the response must have
    @param as_user: user to log in as. A string value (including the
                    default) means "use self.user"; a falsy value posts
                    without being logged in.
    @param return_json: decode the response body as JSON; defaults to True
                        exactly when expected_status is a 2xx code
    @return: the decoded JSON body, or the raw response object
    """
    if isinstance(as_user, str):
        # Only the literal "self.user" used to be resolved. The default value
        # did not match it, so default calls failed on `.username` of a str.
        # Treat every string as the sentinel for the test's own user.
        as_user = self.user
    if return_json is None:
        return_json = (expected_status // 100) == 2

    if as_user:
        self.client.login(username=as_user.username, password="******")
    else:
        self.client.logout()

    url = self.url_set(**url_kwargs)
    response = self.client.post(url,
                                content_type="application/json",
                                data=json.dumps(data))
    self.assertEqual(
        response.status_code, expected_status,
        "Status code {response.status_code}: {response.content}".format(
            response=response))

    # Refresh so callers can query the index right after posting
    amcates.ES().refresh()

    if return_json:
        return json.loads(response.content.decode(response.charset))
    return response
def test_dupe(self):
    """Test whether deduplication works"""
    article = test_article(title='testartikel')

    # Post the same article twice, calling setUp_set() in between
    first_id = self._post_articles(article)['id']
    self.setUp_set()
    second_id = self._post_articles(article)['id']
    amcates.ES().refresh()

    # are the resulting ids identical?
    self.assertEqual(first_id, second_id)

    # is it added to elastic for this set?
    ids_in_set = set(amcates.ES().query_ids(filters={'sets': self.aset.id}))
    self.assertEqual(ids_in_set, {first_id})

    # is it not added (ie we only have one article with this title)
    ids_with_title = set(
        amcates.ES().query_ids(filters={'title': article['title']}))
    self.assertEqual(ids_with_title, {first_id})
def get_nn(self):
    """
    Return the result of an index aggregation grouped by 'sets', as a dict,
    for the set(s) handled by the current view.
    """
    view = self.context["view"]
    if not hasattr(view, 'object_list'):
        # The view carries a single object rather than a list
        set_ids = [view.object.id]
    else:
        set_ids = list(view.object_list.values_list("id", flat=True))

    counts = amcates.ES().aggregate_query(filters={'sets': set_ids},
                                          group_by='sets')
    return dict(counts)
def delete(self, purge_orphans=True):
    """
    Delete the articleset and all articles from index and db.

    Articles that are in no other set are deleted from the database. Every
    article of this set has its membership removed from the index. When
    `purge_orphans` is True the index is asked to purge orphans afterwards.
    """
    # which articles are only in this set?
    # check per N articles
    member_ids = self.articles.values_list("pk", flat=True)
    for batch in toolkit.splitlist(member_ids):
        # Ids from this batch that are also in another set
        elsewhere = ArticleSetArticle.objects.filter(article_id__in=batch)
        elsewhere = elsewhere.exclude(articleset=self)
        shared = set(elsewhere.values_list("article_id", flat=True))

        only_here = set(batch) - shared
        Article.objects.filter(pk__in=only_here).delete()
        amcates.ES().remove_from_set(self.id, batch)

    if purge_orphans:
        amcates.ES().purge_orphans()

    # cascade deletes all article references
    super(ArticleSet, self).delete()
def initialize(sender, **kwargs):
    """
    Initialize the amcat database: load the initial data fixture, create the
    admin account and check the elastic index.

    `sender` and `kwargs` are accepted but not used.
    """
    # The fixture sits next to the amcat.models package
    models_dir = os.path.dirname(amcat.models.__file__)
    fixture = os.path.join(models_dir, "_initial_data.json")

    loaddata_argv = ["manage", "loaddata", fixture]
    Command().run_from_argv(loaddata_argv)

    create_admin()
    amcates.ES().check_index()
def remove_articles(self, articles, remove_from_index=True):
    """
    Remove the given articles from this article set.

    The ArticleSetArticle rows linking the articles to this set are deleted.
    If `remove_from_index` is True the set membership is removed from the
    index as well.

    @param articles: articles to be removed, as integers (ids) or objects
                     with an `id` attribute. It is iterated more than once,
                     so a one-shot iterator should not be passed.
    @param remove_from_index: also remove the set membership from the index
    @type remove_from_index: bool
    """
    ArticleSetArticle.objects.filter(articleset=self,
                                     article__in=articles).delete()

    if remove_from_index:
        # The index call gets plain ids, whatever mix was passed in
        to_remove = {(art if isinstance(art, int) else art.id)
                     for art in articles}
        amcates.ES().remove_from_set(self.id, to_remove)
def get_adhoc_result(analysis, text, store_intermediate=True):
    """
    Run an xtas analysis pipeline on a piece of text that is not stored as
    an article.

    @param analysis: analysis specification, resolved via _get_analysis
    @param text: the text to analyse
    @param store_intermediate: passed through to the xtas pipeline call
    @return: whatever the xtas pipeline call returns
    """
    # Imported locally, as in the original; presumably so xtas is only
    # needed when this function is actually used.
    from xtas.tasks.es import adhoc_document
    from xtas.tasks.pipeline import pipeline

    analysis = _get_analysis(analysis)
    es = amcates.ES()
    doc = adhoc_document('adhoc', es.doc_type, 'text', text=text)
    # Call form of print: valid on Python 3, same output on Python 2
    print("Pipelining...")
    return pipeline(doc, analysis, store_intermediate=store_intermediate)
def test_deduplication(self):
    """Does deduplication work as it is supposed to?"""
    # create dummy articles to have something in the db
    for _ in range(10):
        amcattest.create_test_article()
    amcates.ES().refresh()

    art = {
        "project": amcattest.create_test_project(),
        "title": "deduptest",
        "text": "test",
        "date": '2001-01-01',
    }

    original = amcattest.create_test_article(**art)
    amcates.ES().refresh()
    self.assertEqual(_q(title='deduptest'), {original.id})

    # duplicate articles should not be added
    duplicate = amcattest.create_test_article(**art)
    amcates.ES().refresh()
    self.assertEqual(duplicate.id, original.id)
    self.assertTrue(duplicate._duplicate)
    self.assertEqual(_q(title='deduptest'), {original.id})

    # however, if an articleset is given the 'existing' article
    # should be added to that set
    target_set = amcattest.create_test_set()
    in_set = amcattest.create_test_article(articleset=target_set, **art)
    amcates.ES().refresh()
    self.assertEqual(in_set.id, original.id)
    self.assertEqual(_q(title='deduptest'), {original.id})
    self.assertEqual(set(target_set.get_article_ids()), {original.id})
    self.assertEqual(_q(sets=target_set.id), {original.id})

    # if an existing hash is set, it should be correct
    with_bad_hash = dict(hash=b'hash', **art)
    with self.assertRaises(ValueError):
        amcattest.create_test_article(**with_bad_hash)

    #TODO! Check duplicates within new articles
    art['title'] = "internaldupe"
    twin_a = Article(**art)
    twin_b = Article(**art)
    Article.create_articles([twin_a, twin_b], articleset=target_set)
    self.assertEqual(twin_a.id, twin_b.id)
    self.assertEqual(len(_q(title='internaldupe')), 1)
def delete(self, purge_orphans=True):
    """
    Delete the articleset and all articles from index and db.

    Articles that belong to no other set are deleted from the database in
    batches. The set membership is then removed from the index for the ids
    the index reports for this set. Finally the set itself is deleted.

    @param purge_orphans: if True, refresh the index and ask it to purge
                          orphans before the set itself is deleted
    """
    # NOTE(review): `log` is assumed to be a standard `logging` logger, for
    # which `warning` replaces the deprecated `warn` alias. Confirm.

    # which articles are only in this set?
    # check per N articles
    log.warning("Getting all articles")
    all_ids = list(self.articles.values_list("pk", flat=True))
    todelete = set(all_ids)

    log.warning("Finding orphans in {} articles".format(len(all_ids)))
    for batch in toolkit.splitlist(all_ids, itemsperbatch=1000):
        # Ids in this batch that are also a member of another set must stay
        shared = set(
            ArticleSetArticle.objects.filter(article_id__in=batch).exclude(
                articleset=self).values_list("article_id", flat=True))
        todelete -= shared

    log.warning("Removing {} orphans from DB".format(len(todelete)))
    for i, batch in enumerate(
            toolkit.splitlist(todelete, itemsperbatch=10000)):
        if i > 1:
            log.warning("... batch {i} (x10k)".format(i=i))
        Article.objects.filter(pk__in=batch).only("pk").delete()

    log.warning("Getting set membership from elastic")
    esaids = list(self.get_article_ids_from_elastic())
    if esaids:
        log.warning(
            "Removing set membership from elastic ({} articles)".format(
                len(esaids)))
        amcates.ES().remove_from_set(self.id, esaids)

    if purge_orphans:
        amcates.ES().refresh()
        amcates.ES().purge_orphans()

    log.warning("Deleting set (and articlesetarticle references)")
    # cascade deletes all article references
    super(ArticleSet, self).delete()
    log.warning("Done!")
def __init__(self, user=None, queries=None, filters=None, fields=None, hits=False):
    """
    Store the search parameters and create an ES handle.

    @param user: stored as-is
    @param queries: stored as-is
    @param filters: filter dict; a falsy value becomes an empty dict
    @param fields: field names; "id" entries are dropped, and a falsy value
                   becomes an empty list
    @param hits: stored as-is
    """
    # "id" is never kept in the field list
    wanted_fields = []
    for field in (fields or []):
        if field != "id":
            wanted_fields.append(field)

    self.user = user
    self.queries = queries
    self.filters = filters if filters else {}
    self.fields = wanted_fields
    self.es = amcates.ES()
    self.hits = hits
    self._count = None
def test_post_id(self):
    """Existing articles can be added to the set by posting only their ids."""
    def ids_in_set():
        # Ids the index currently reports for the set under test
        return set(amcates.ES().query_ids(filters={"sets": self.aset.id}))

    a = amcattest.create_test_article()
    self._post_articles({"id": a.id})
    self.assertEqual(ids_in_set(), {a.id})

    a2 = amcattest.create_test_article()
    self._post_articles([{"id": a.id}, {"id": a2.id}])
    self.assertEqual(ids_in_set(), {a.id, a2.id})

    # does it also work if we just post the ids?
    self.setUp_set()
    self._post_articles(a.id)
    self.assertEqual(ids_in_set(), {a.id})

    self._post_articles([a.id, a2.id])
    self.assertEqual(ids_in_set(), {a.id, a2.id})
def inner(*args, **kargs):
    """
    Rebuild the elastic index and then call the wrapped `func`.

    Raises unittest.SkipTest when the elastic server does not answer a ping.
    """
    from amcat.tools import amcates

    # Clear the module-level property cache before touching the index
    amcates._KNOWN_PROPERTIES = None

    elastic = amcates.ES()
    reachable = elastic.es.ping()
    if not reachable:
        raise unittest.SkipTest("ES not enabled")

    # Drop and recreate the index, refreshing after each step
    elastic.delete_index()
    elastic.refresh()
    elastic.check_index()
    elastic.refresh()

    return func(*args, **kargs)
def get_result(article, analysis, store_intermediate=True, block=True):
    """
    Run an xtas analysis pipeline on the 'text' field of an indexed article.

    @param article: article id (int) or an object with an `id` attribute
    @param analysis: analysis specification, resolved via _get_analysis
    @param store_intermediate: passed through to the xtas pipeline call
    @param block: accepted but not used by this function
    @return: whatever the xtas pipeline call returns
    """
    from xtas.tasks.pipeline import pipeline

    article_id = article if isinstance(article, int) else article.id
    analysis = _get_analysis(analysis)

    es = amcates.ES()
    # Reference to the stored document instead of the text itself
    doc = dict(index=es.index, type=es.doc_type, id=article_id, field='text')
    return pipeline(doc, analysis, store_intermediate=store_intermediate)
def to_representation(self, aid):
    """
    Yield one dict per token of article `aid`, for the title and text
    fields, ordered by field (title first) and then by position.
    """
    fields = ["title", "text"]

    def field_then_offset(token):
        # Tokens are (field, offset, term) triples
        name, offset, _term = token
        return fields.index(name), offset

    tokens = amcates.ES().get_tokens(aid, fields)
    ordered = sorted(tokens, key=field_then_offset)
    for name, position, term in ordered:
        yield dict(id=aid, field=name, position=position, word=term)
def add_articles(self, articles, add_to_index=True, monitor=None):
    """
    Add the given articles to this articleset. The implementation consists
    of three parts:

      1. Adding ArticleSetArticle objects
      2. Adding CodedArticle objects
      3. Updating index

    Articles that are already in the set, or that Article.exists does not
    return, are skipped.

    @param articles: articles to be added
    @type articles: iterable with indexing of integers or Article objects

    @param add_to_index: notify elasticsearch of changes
    @type add_to_index: bool

    @param monitor: progress monitor; a new ProgressMonitor is created when
                    none is given
    """
    if monitor is None:
        # A `ProgressMonitor()` default in the signature is built once and
        # shared by every call. Create one per call instead.
        monitor = ProgressMonitor()

    article_ids = {(art if isinstance(art, int) else art.id)
                   for art in articles}
    to_add = article_ids - self.get_article_ids()
    # Only use articles that exist
    to_add = list(Article.exists(to_add))

    monitor.update(10, "{n} articles need to be added".format(n=len(to_add)))
    ArticleSetArticle.objects.bulk_create(
        [
            ArticleSetArticle(articleset=self, article_id=artid)
            for artid in to_add
        ],
        batch_size=100,
    )

    monitor.update(
        20, "{n} articleset articles added to database, adding to codingjobs".
        format(n=len(to_add)))
    # One CodedArticle per pair of (codingjob on this set, new article)
    cjarts = [
        CodedArticle(codingjob=c, article_id=a)
        for c, a in itertools.product(self.codingjob_set.all(), to_add)
    ]
    CodedArticle.objects.bulk_create(cjarts)

    monitor.update(
        30, "{n} articles added to codingjobs, adding to index".format(
            n=len(cjarts)))
    if add_to_index:
        amcates.ES().add_to_set(self.id, to_add, monitor=monitor)