def test_aggregate(self):
    """Aggregate the articles of one set per medium and per date interval.

    Also checks the set statistics (count, first/last date) and the media
    list that ES reports for the same set.
    """
    from amcat.models import Article

    m1 = amcattest.create_test_medium(name="De Nep-Krant")
    m2 = amcattest.create_test_medium()
    m3 = amcattest.create_test_medium()
    s1 = amcattest.create_test_set()
    s2 = amcattest.create_test_set()
    # Article in a different set and medium; none of the s1 results below
    # include it.
    amcattest.create_test_article(text='aap noot mies', medium=m3, articleset=s2)

    specs = [
        ('aap noot mies', m1, '2001-01-01'),
        ('noot mies wim zus', m2, '2001-02-01'),
        ('mies bla bla bla wim zus jet', m2, '2002-01-01'),
        ('noot mies wim zus', m2, '2001-02-03'),
    ]
    unsaved = [
        amcattest.create_test_article(text=text, medium=medium, date=date, create=False)
        for text, medium, date in specs
    ]
    Article.create_articles(unsaved, articleset=s1, check_duplicate=False, create_id=True)
    ES().flush()

    def aggregate(**options):
        # Fresh ES instance and fresh filter dict per query, as in each call below.
        return dict(ES().aggregate_query(filters=dict(sets=s1.id), **options))

    self.assertEqual(aggregate(group_by="mediumid"), {m1.id: 1, m2.id: 3})
    self.assertEqual(
        aggregate(group_by="date", date_interval="year"),
        {datetime(2001, 1, 1): 3, datetime(2002, 1, 1): 1},
    )
    self.assertEqual(
        aggregate(group_by="date", date_interval="month"),
        {datetime(2001, 1, 1): 1, datetime(2002, 1, 1): 1, datetime(2001, 2, 1): 2},
    )

    # set statistics
    stats = ES().statistics(filters=dict(sets=s1.id))
    self.assertEqual(stats.n, 4)
    self.assertEqual(stats.start_date, datetime(2001, 1, 1))
    self.assertEqual(stats.end_date, datetime(2002, 1, 1))

    # media list
    listed = set(ES().list_media(filters=dict(sets=s1.id)))
    self.assertEqual(listed, {m1.id, m2.id})
def test_to_medium_ids(self):
    """to_medium_ids accepts a medium, a list, a queryset and a values list."""
    arts = amcattest.create_test_set(2).articles.all()
    m1 = amcattest.create_test_medium()
    m2 = amcattest.create_test_medium()
    both = {m1.id, m2.id}

    # single medium object
    self.assertEqual(set(to_medium_ids(m1)), {m1.id})
    # plain list of medium objects
    self.assertEqual(set(to_medium_ids([m1, m2])), both)
    # queryset of media
    media_qs = Medium.objects.filter(id__in=[m1.id, m2.id])
    self.assertEqual(set(to_medium_ids(media_qs)), both)
    # flat values list of medium ids taken from articles
    from_values = set(to_medium_ids(arts.values_list("medium__id", flat=True)))
    self.assertEqual(from_values, {art.medium_id for art in arts})
def test_to_medium_ids(self):
    """Check to_medium_ids for each kind of input used in this test."""
    arts = amcattest.create_test_set(2).articles.all()
    first = amcattest.create_test_medium()
    second = amcattest.create_test_medium()

    def ids_of(value):
        return set(to_medium_ids(value))

    self.assertEqual(ids_of(first), {first.id})
    self.assertEqual(ids_of([first, second]), {first.id, second.id})
    self.assertEqual(
        ids_of(Medium.objects.filter(id__in=[first.id, second.id])),
        {first.id, second.id},
    )
    self.assertEqual(
        ids_of(arts.values_list("medium__id", flat=True)),
        {article.medium_id for article in arts},
    )
def set_up(self):
    """Create one set with four articles over two media and flush ES.

    Returns the tuple (aset, m1, m2, a1, a2, a3, a4).
    """
    # We cannot use setUp, as use_elastic deletes indices
    aset = amcattest.create_test_set()
    m1 = amcattest.create_test_medium()
    m2 = amcattest.create_test_medium()

    in_set = dict(articleset=aset)
    a1 = amcattest.create_test_article(text="Foo", medium=m1, date=datetime(2014, 4, 3), **in_set)
    a2 = amcattest.create_test_article(text="Bar", medium=m1, date=datetime(2015, 4, 3), **in_set)
    # a3 is the only article created without an explicit date
    a3 = amcattest.create_test_article(text="FooBar", medium=m2, **in_set)
    a4 = amcattest.create_test_article(text="BarFoo", medium=m2, date=datetime(2014, 1, 3), **in_set)

    ES().flush()
    return aset, m1, m2, a1, a2, a3, a4
def setup(self):
    """Build the shared fixture: three media, two sets and five articles.

    Articles a-d are created unsaved and then stored in s1 through
    Article.create_articles; e is created directly in s2.
    Returns (m1, m2, m3, s1, s2, a, b, c, d, e).
    """
    m1 = amcattest.create_test_medium(name="De Nep-Krant")
    m2 = amcattest.create_test_medium()
    m3 = amcattest.create_test_medium()
    s1 = amcattest.create_test_set()
    s2 = amcattest.create_test_set()

    def unsaved(text, medium, date):
        return amcattest.create_test_article(text=text, medium=medium, date=date, create=False)

    a = unsaved('aap noot mies', m1, '2001-01-01')
    b = unsaved('noot mies wim zus', m2, '2001-02-01')
    c = unsaved('mies bla bla bla wim zus jet', m2, '2002-01-01')
    d = unsaved('noot mies wim zus', m2, '2001-02-03')
    e = amcattest.create_test_article(text='aap noot mies', medium=m3, articleset=s2)

    Article.create_articles([a, b, c, d], articleset=s1)
    ES().flush()
    return m1, m2, m3, s1, s2, a, b, c, d, e
def setup(self):
    """Create media, sets and articles shared by the tests and flush ES.

    Articles a-d are stored in s1 via Article.create_articles with duplicate
    checking disabled and create_id enabled; e lives in s2.
    Returns (m1, m2, m3, s1, s2, a, b, c, d, e).
    """
    m1 = amcattest.create_test_medium(name="De Nep-Krant")
    m2 = amcattest.create_test_medium()
    m3 = amcattest.create_test_medium()
    s1 = amcattest.create_test_set()
    s2 = amcattest.create_test_set()

    rows = [
        ('aap noot mies', m1, '2001-01-01'),
        ('noot mies wim zus', m2, '2001-02-01'),
        ('mies bla bla bla wim zus jet', m2, '2002-01-01'),
        ('noot mies wim zus', m2, '2001-02-03'),
    ]
    a, b, c, d = [
        amcattest.create_test_article(text=text, medium=medium, date=date, create=False)
        for text, medium, date in rows
    ]
    e = amcattest.create_test_article(text='aap noot mies', medium=m3, articleset=s2)

    Article.create_articles([a, b, c, d], articleset=s1,
                            check_duplicate=False, create_id=True)
    ES().flush()
    return m1, m2, m3, s1, s2, a, b, c, d, e
def test_elastic_hash(self):
    """Can we reproduce a hash from elastic data alone?"""
    article = Article(
        date=datetime.date(2015, 1, 1),
        section="\u6f22\u5b57",
        pagenr=1928390,
        headline="Headline hier.",
        byline="byline..",
        length=1928,
        metastring="Even more strange characters.. \x0C ..",
        url="https://example.com",
        externalid=None,
        author=None,
        addressee="Hmm",
        text="Contains invalid char \x08 woo",
        medium=create_test_medium(name="abc."),
        project=create_test_project(),
    )
    article.save()

    es = ES()
    es.add_articles([article.id])
    # Hash computed from the database object, before reading anything back.
    expected_hash = get_article_dict(article)["hash"]
    es.flush()

    hits = es.query_all(filters={"ids": [article.id]}, fields=HASH_FIELDS + ["hash"])
    es_article = list(hits)[0]

    self.assertEqual(article.id, es_article.id)
    self.assertEqual(expected_hash, es_article.hash)
    # Recomputing the hash from the ES document alone must give the same value.
    self.assertEqual(_get_hash(es_article.to_dict()), expected_hash)
def test_list_media(self):
    """Test that list media works for more than 10 media"""
    from amcat.models import Article

    media = []
    for _ in range(20):
        media.append(amcattest.create_test_medium())
    arts = []
    for medium in media:
        arts.append(amcattest.create_test_article(medium=medium, create=False))

    # First five articles/media go into s1 ...
    s1 = amcattest.create_test_set()
    Article.create_articles(arts[:5], articleset=s1, check_duplicate=False, create_id=True)
    ES().flush()
    self.assertEqual(set(s1.get_mediums()), set(media[:5]))

    # ... the remaining fifteen into a second set of the same project.
    s2 = amcattest.create_test_set(project=s1.project)
    Article.create_articles(arts[5:], articleset=s2, check_duplicate=False, create_id=True)
    ES().flush()
    self.assertEqual(set(s2.get_mediums()), set(media[5:]))

    # The project spans both sets and so all twenty media.
    self.assertEqual(set(s1.project.get_mediums()), set(media))
def create_test_article(n):
    """Return a dict describing test article number *n*.

    The dict has the keys headline, text, date (today's date in ISO format)
    and medium (the name of a newly created test medium).
    """
    headline = str(n)
    text = "test %s" % n
    today = datetime.date.today().isoformat()
    medium_name = amcattest.create_test_medium().name
    return {
        "headline": headline,
        "text": text,
        "date": today,
        "medium": medium_name,
    }
def test_scores(self):
    """test if scores (and matches) are as expected for various queries"""
    scored = amcattest.create_test_article(headline="a", text='dit is een test')
    s = amcattest.create_test_set(articles=[scored])
    s.refresh_index()

    def q(query):
        # Map headline -> score for every hit of *query* within set s.
        hits = ES().query(query, filters={'sets': s.id}, fields=["headline"])
        return dict((hit.headline, hit.score) for hit in hits)

    self.assertEqual(q("test"), {"a": 1})

    m1 = amcattest.create_test_medium()
    m2 = amcattest.create_test_medium()
    a = amcattest.create_test_article(text='aap noot mies', medium=m1)
    b = amcattest.create_test_article(text='noot mies wim zus', medium=m2)
    c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', medium=m2)
    d = amcattest.create_test_article(text='ik woon in een sociale huurwoning, net als anderen', medium=m2)
    ES().flush()

    # Unfiltered wildcard query
    self.assertEqual(set(ES().query_ids("no*")), {a.id, b.id})

    # The same kinds of query restricted to medium m2; the last case is run
    # twice, exactly as the assertions were originally written.
    m2_cases = [
        ("no*", {b.id}),
        ("zus AND jet", {c.id}),
        ("zus OR jet", {b.id, c.id}),
        ('"mies wim"', {b.id}),
        ('"mies wim"~5', {b.id, c.id}),
        ('"sociale huur*"', {d.id}),
        ('"sociale huur*"', {d.id}),
    ]
    for query, expected in m2_cases:
        found = set(ES().query_ids(query, filters=dict(mediumid=m2.id)))
        self.assertEqual(found, expected)
def test_filters(self):
    """Do filters work properly?"""
    m1 = amcattest.create_test_medium()
    m2 = amcattest.create_test_medium()
    a = amcattest.create_test_article(text='aap noot mies', medium=m1, date="2001-01-01")
    b = amcattest.create_test_article(text='noot mies wim zus', medium=m2, date="2002-01-01")
    c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', medium=m2, date="2003-01-01")
    s1 = amcattest.create_test_set(articles=[a, b, c])
    s2 = amcattest.create_test_set(articles=[a, b])
    ES().flush()

    def q(**filters):
        # Ids of all articles matching the given filters (no query string).
        return set(ES().query_ids(filters=filters))

    # MEDIUM FILTER
    self.assertEqual(q(mediumid=m2.id), {b.id, c.id})

    # DATE FILTERS
    self.assertEqual(q(sets=s1.id, start_date='2001-06-01'), {b.id, c.id})
    # start is inclusive
    self.assertEqual(q(sets=s1.id, start_date='2002-01-01', end_date="2002-06-01"), {b.id})
    # end is exclusive
    self.assertEqual(q(sets=s1.id, start_date='2001-01-01', end_date="2003-01-01"), {a.id, b.id})

    # COMBINATION
    self.assertEqual(q(sets=s2.id, start_date='2001-06-01'), {b.id})
    self.assertEqual(q(end_date='2002-06-01', mediumid=m2.id), {b.id})
def test_aggregation(self):
    """Can we create nice tables?"""
    p = amcattest.create_test_project()
    m1 = amcattest.create_test_medium()
    m2 = amcattest.create_test_medium()
    few = {amcattest.create_test_article(project=p, medium=m1) for _ in range(5)}
    many = {amcattest.create_test_article(project=p, medium=m2) for _ in range(15)}

    aset = amcattest.create_test_set(project=p)
    aset.add_articles(few | many)
    aset.refresh_index()

    # can we select on mediumid
    self.assertEqual(self.list(projects=[p.id]), self.pks(few | many))
    self.assertEqual(self.list(projects=[p.id], mediums=[m1.id]), self.pks(few))

    # can we make a table?
    table = self.aggr(projects=[p.id], xAxis='medium')
    self.assertEqual(set(table), {(5,), (15,)})

    # add second project with articles from first project in set
    p2 = amcattest.create_test_project()
    s = amcattest.create_test_set(project=p2)
    s.add(*(few | many))
    # The result is not asserted on; the call only has to complete.
    self.aggr(projects=[p2.id], articlesets=[s.id], xAxis='medium')
def test_get_model_field(self):
    """get_model_field resolves plain fields and '__'-separated paths."""
    guardian = create_test_medium(name="The Guardian")
    article = create_test_article(text="abc", medium=guardian)

    self.assertEqual(article.medium.name, "The Guardian")
    # related field reached through a double-underscore path
    self.assertEqual(get_model_field(article, "medium__name"), "The Guardian")
    # the relation itself
    self.assertEqual(get_model_field(article, "medium"), article.medium)
    # plain attribute
    self.assertEqual(get_model_field(article, "text"), "abc")
def test_deduplication(self):
    """Does deduplication work as it is supposed to?"""
    art = dict(
        headline="test",
        byline="test",
        date='2001-01-01',
        medium=amcattest.create_test_medium(),
        project=amcattest.create_test_project(),
    )
    original = amcattest.create_test_article(**art)

    def q(**filters):
        # Flush first so that everything indexed so far is searchable.
        amcates.ES().flush()
        return set(amcates.ES().query_ids(filters=filters))

    self.assertEqual(q(mediumid=art['medium']), {original.id})

    # duplicate articles should not be added
    dupe = amcattest.create_test_article(check_duplicate=True, **art)
    self.assertFalse(Article.objects.filter(pk=dupe.id).exists())
    self.assertEqual(dupe.duplicate_of, original.id)
    self.assertEqual(q(mediumid=art['medium']), {original.id})

    # however, if an articleset is given the 'existing' article
    # should be added to that set
    s1 = amcattest.create_test_set()
    dupe_in_set = amcattest.create_test_article(check_duplicate=True, articleset=s1, **art)
    # NOTE(review): this re-checks the first duplicate's id; the check was
    # presumably meant for the article created just above - confirm.
    self.assertFalse(Article.objects.filter(pk=dupe.id).exists())
    self.assertEqual(dupe_in_set.duplicate_of, original.id)
    self.assertEqual(q(mediumid=art['medium']), {original.id})
    self.assertEqual(set(s1.get_article_ids()), {original.id})
    self.assertEqual(q(sets=s1.id), {original.id})

    # can we suppress duplicate checking?
    forced = amcattest.create_test_article(check_duplicate=False, **art)
    self.assertTrue(Article.objects.filter(pk=forced.id).exists())
    self.assertFalse(hasattr(forced, 'duplicate_of'))
    self.assertIn(forced.id, q(mediumid=art['medium']))
def test_filters(self):
    """Check the medium, date-range and combined filters of query_ids."""
    m1, m2 = amcattest.create_test_medium(), amcattest.create_test_medium()
    a = amcattest.create_test_article(text='aap noot mies', medium=m1, date="2001-01-01")
    b = amcattest.create_test_article(text='noot mies wim zus', medium=m2, date="2002-01-01")
    c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', medium=m2, date="2003-01-01")
    s1 = amcattest.create_test_set(articles=[a, b, c])
    s2 = amcattest.create_test_set(articles=[a, b])
    ES().flush()

    def matching(**filters):
        return set(ES().query_ids(filters=filters))

    # MEDIUM FILTER
    in_m2 = matching(mediumid=m2.id)
    self.assertEqual(in_m2, {b.id, c.id})

    # DATE FILTERS
    from_mid_2001 = matching(sets=s1.id, start_date='2001-06-01')
    self.assertEqual(from_mid_2001, {b.id, c.id})
    # start is inclusive
    first_half_2002 = matching(sets=s1.id, start_date='2002-01-01', end_date="2002-06-01")
    self.assertEqual(first_half_2002, {b.id})
    # end is exclusive
    before_2003 = matching(sets=s1.id, start_date='2001-01-01', end_date="2003-01-01")
    self.assertEqual(before_2003, {a.id, b.id})

    # COMBINATION
    self.assertEqual(matching(sets=s2.id, start_date='2001-06-01'), {b.id})
    self.assertEqual(matching(end_date='2002-06-01', mediumid=m2.id), {b.id})
def test_query_args_from_form(self):
    """query_args_from_form turns form data into start/rows/filters arguments."""
    m = amcattest.create_test_medium()
    s1 = amcattest.create_test_set()
    s2 = amcattest.create_test_set()

    columns = [u'article_id', u'date', u'medium_id', u'medium_name', u'headline']
    form = dict(
        sortColumn='',
        useSolr=True,
        start=100,
        length=100,
        articleids=[],
        articlesets=[s1, s2],
        mediums=[m],
        projects=[],
        columns=columns,
        highlight=False,
        columnInterval='month',
        datetype='all',
        sortOrder='',
    )

    # Name the format arguments explicitly instead of passing all locals.
    medium_filter = u'mediumid:{m.id}'.format(m=m)
    sets_filter = u'sets:{s1.id} OR sets:{s2.id}'.format(s1=s1, s2=s2)
    expected = dict(start=100, rows=100, filters=[medium_filter, sets_filter])

    args = query_args_from_form(form)
    self.assertEqual(args, expected)
def test_full_refresh(self):
    """test full refresh, e.g. document content change"""
    m1 = amcattest.create_test_medium()
    m2 = amcattest.create_test_medium()
    a = amcattest.create_test_article(text='aap noot mies', medium=m1)
    s = amcattest.create_test_set()
    s.add(a)
    s.refresh_index()

    def indexed_under(medium):
        # Ids that ES returns for set s when filtered on the given medium.
        return set(ES().query_ids(filters=dict(sets=s.id, mediumid=medium.id)))

    self.assertEqual(indexed_under(m1), {a.id})

    a.medium = m2
    a.save()

    s.refresh_index(full_refresh=False)  # a should NOT be reindexed
    self.assertEqual(indexed_under(m1), {a.id})
    self.assertEqual(indexed_under(m2), set())

    s.refresh_index(full_refresh=True)
    self.assertEqual(indexed_under(m1), set())
    self.assertEqual(indexed_under(m2), {a.id})
def test_scores(self):
    """test if scores (and matches) are as expected for various queries"""
    s = amcattest.create_test_set(
        articles=[amcattest.create_test_article(headline="a", text='dit is een test')]
    )
    s.refresh_index()

    def q(query):
        # headline -> score for the hits of *query* inside set s
        scores = {}
        for hit in ES().query(query, filters={'sets': s.id}, fields=["headline"]):
            scores[hit.headline] = hit.score
        return scores

    self.assertEqual(q("test"), {"a": 1})

    m1 = amcattest.create_test_medium()
    m2 = amcattest.create_test_medium()
    a = amcattest.create_test_article(text='aap noot mies', medium=m1)
    b = amcattest.create_test_article(text='noot mies wim zus', medium=m2)
    c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', medium=m2)
    d = amcattest.create_test_article(text='ik woon in een sociale huurwoning, net als anderen', medium=m2)
    # Index the four articles explicitly before flushing.
    ES().add_articles([a.id, b.id, c.id, d.id])
    ES().flush()

    def ids(query, **filters):
        # Without filters no `filters` argument is passed, as in the plain call.
        if filters:
            return set(ES().query_ids(query, filters=filters))
        return set(ES().query_ids(query))

    self.assertEqual(ids("no*"), {a.id, b.id})
    self.assertEqual(ids("no*", mediumid=m2.id), {b.id})
    self.assertEqual(ids("zus AND jet", mediumid=m2.id), {c.id})
    self.assertEqual(ids("zus OR jet", mediumid=m2.id), {b.id, c.id})
    self.assertEqual(ids('"mies wim"', mediumid=m2.id), {b.id})
    self.assertEqual(ids('"mies wim"~5', mediumid=m2.id), {b.id, c.id})
    # The phrase-with-wildcard query is issued twice, as originally written.
    self.assertEqual(ids('"sociale huur*"', mediumid=m2.id), {d.id})
    self.assertEqual(ids('"sociale huur*"', mediumid=m2.id), {d.id})
def test_get_model_field(self):
    """Look up fields on an article through get_model_field."""
    article = create_test_article(
        text="abc",
        medium=create_test_medium(name="The Guardian"),
    )
    self.assertEqual(article.medium.name, "The Guardian")

    name_via_path = get_model_field(article, "medium__name")
    self.assertEqual(name_via_path, "The Guardian")

    medium = get_model_field(article, "medium")
    self.assertEqual(medium, article.medium)

    text = get_model_field(article, "text")
    self.assertEqual(text, "abc")
def test_get_mediums(self):
    """A set reports the media of all articles added to it."""
    aset = amcattest.create_test_set(0)

    media = []
    for i in range(10):
        media.append(amcattest.create_test_medium(name="Test__" + str(i)))

    # One article per medium, added to the set one at a time.
    for medium in media:
        article = amcattest.create_test_article(medium=medium)
        aset.add(article)
    aset.refresh_index()

    # Test if medium really added
    self.assertEqual(set(aset.get_mediums()), set(media))
def test_get_mediums(self):
    """get_mediums returns every medium used by the set's articles."""
    aset = amcattest.create_test_set(0)
    names = ["Test__" + str(i) for i in range(10)]
    media = [amcattest.create_test_medium(name=name) for name in names]

    for m in media:
        aset.add(amcattest.create_test_article(medium=m))
    aset.refresh_index()

    # Test if medium really added
    expected = set(media)
    self.assertEqual(set(aset.get_mediums()), expected)
def test_date(self):
    """Run do_test on articles that differ only in date/time.

    Articles 1 and 2 share a calendar day but differ in time of day;
    article 3 is dated one day later. The expected id sets are checked
    with and without ignore_date.
    """
    s = amcattest.create_test_set()
    m = amcattest.create_test_medium()
    arts = [
        amcattest.create_test_article(id=1, articleset=s, medium=m, date="2001-01-01"),
        amcattest.create_test_article(id=2, articleset=s, medium=m, date="2001-01-01 02:00"),
        amcattest.create_test_article(id=3, articleset=s, medium=m, date="2001-01-02"),
    ]
    # (the unused `aids` list of article ids was dropped)
    self.assertEqual(self.do_test(arts), {1, 2, 3})
    self.assertEqual(self.do_test(arts, ignore_date=True), {1, 3})
def test_date(self):
    """Run do_test on articles that differ only in date/time.

    The first two articles share a calendar day but differ in time of day;
    the third is dated one day later. The expected id sets are checked with
    and without ignore_date.
    """
    s = amcattest.create_test_set()
    m = amcattest.create_test_medium()
    # Everything except the date is identical across the three articles.
    adict = dict(text="text", headline="headline", articleset=s, medium=m)
    arts = [
        amcattest.create_test_article(date="2001-01-01", **adict),
        amcattest.create_test_article(date="2001-01-01 02:00", **adict),
        amcattest.create_test_article(date="2001-01-02", **adict),
    ]
    # (the unused `aids` list of article ids was dropped)
    # NOTE(review): the expected ids are hard-coded although no explicit ids
    # are passed above - presumably this relies on a fresh test database
    # handing out ids 1..3; confirm.
    self.assertEqual(self.do_test(arts), {1, 2, 3})
    self.assertEqual(self.do_test(arts, ignore_date=True), {1, 3})
def test_get_mediums(self):
    """With the mediums cache enabled, a set still reports all its media."""
    from django.core.cache import cache

    # Start from an empty cache, then switch the mediums cache on.
    cache.clear()
    AmCAT.enable_mediums_cache()

    aset = amcattest.create_test_set(0)
    media = []
    for i in range(10):
        media.append(amcattest.create_test_medium(name="Test__" + str(i)))
    for medium in media:
        aset.add(amcattest.create_test_article(medium=medium))
    aset.refresh_index()

    # Test if medium really added
    self.assertEqual(set(aset.get_mediums()), set(media))
def test_dupe(self):
    """Test whether deduplication works"""
    m = amcattest.create_test_medium()
    payload = test_article(medium=m.name)

    # Post the same article twice, switching sets via setUp_set in between.
    first_id = self._post_articles(payload)['id']
    self.setUp_set()
    second_id = self._post_articles(payload)['id']

    # are the resulting ids identical?
    self.assertEqual(first_id, second_id)

    # is it not added (ie we only have one article with this medium)
    by_medium = set(amcates.ES().query_ids(filters={'mediumid': m.id}))
    self.assertEqual(by_medium, {first_id})

    # is it added to elastic for this set?
    by_set = set(amcates.ES().query_ids(filters={'sets': self.aset.id}))
    self.assertEqual(by_set, {first_id})
def test_fuzzy(self):
    """Run do_test with decreasing headline_ratio on near-identical headlines."""
    s = amcattest.create_test_set()
    m = amcattest.create_test_medium()
    headlines = [
        "Dit is een test",
        "Dit is ook een test",
        "Dit is ook een tesdt",
        "Is dit een test?",
    ]
    # Explicit ids 1..4, in headline order.
    arts = [
        amcattest.create_test_article(id=number, articleset=s, medium=m, headline=headline)
        for number, headline in enumerate(headlines, 1)
    ]

    self.assertEqual(self.do_test(arts, ignore_medium=True), {1, 2, 3, 4})
    for ratio, expected in [(90, {1, 2, 4}), (80, {1, 4}), (50, {1})]:
        result = self.do_test(arts, ignore_medium=True, headline_ratio=ratio)
        self.assertEqual(result, expected)
def test_dedup(self):
    """Run do_test on articles that differ in medium and page number."""
    s = amcattest.create_test_set()
    m1 = amcattest.create_test_medium()
    m2 = amcattest.create_test_medium()

    # (medium, pagenr) per article; ids are 1..5 in this order. The last two
    # entries are identical.
    layout = [(m1, 1), (m1, 2), (m2, 1), (m2, 2), (m2, 2)]
    arts = [
        amcattest.create_test_article(articleset=s, medium=medium, pagenr=page, id=number)
        for number, (medium, page) in enumerate(layout, 1)
    ]

    self.assertEqual(self.do_test(arts), {1, 2, 3, 4})
    self.assertEqual(self.do_test(arts, dry_run=True), {1, 2, 3, 4, 5})
    self.assertEqual(self.do_test(arts, ignore_medium=True), {1, 2})
    self.assertEqual(self.do_test(arts, ignore_page=True), {1, 3})
def test_refresh_index(self):
    """Are added/removed articles added/removed from the index?"""
    # TODO add/remove articles adds to index automatically (does remove?)
    # so refresh isn't really used. Rewrite to add to db manually

    def indexed(aset):
        # Ids that ES currently returns for the given set.
        return set(ES().query_ids(filters=dict(sets=aset.id)))

    s = amcattest.create_test_set()
    a = amcattest.create_test_article()
    s.add(a)
    self.assertEqual(set(), indexed(s))
    s.refresh_index()
    self.assertEqual({a.id}, indexed(s))

    # check adding of existing articles to a new set:
    s2 = amcattest.create_test_set()
    s2.add(a)
    s2.refresh_index()
    self.assertEqual({a.id}, indexed(s2))

    # check that removing of articles from a set works and does not affect
    # other sets
    s2.remove_articles([a])
    s2.refresh_index()
    self.assertEqual(set(), indexed(s2))
    self.assertEqual({a.id}, indexed(s))

    s.remove_articles([a])
    # still indexed until the set is refreshed
    self.assertEqual({a.id}, indexed(s))
    s.refresh_index()
    self.assertEqual(set(), indexed(s))

    # test that remove from index works for larger sets
    s = amcattest.create_test_set()
    arts = [amcattest.create_test_article(medium=a.medium) for _ in range(20)]
    s.add(*arts)
    s.refresh_index()
    self.assertEqual(indexed(s), {art.id for art in arts})

    s.remove_articles([arts[0]])
    s.remove_articles([arts[-1]])
    s.refresh_index()
    self.assertEqual(indexed(s), {art.id for art in arts[1:-1]})

    # test that changing an article's properties can be reindexed
    arts[1].medium = amcattest.create_test_medium()
    arts[1].save()
def test_refresh_index(self):
    """Are added/removed articles added/removed from the index?"""
    # TODO add/remove articles adds to index automatically (does remove?)
    # so refresh isn't really used. Rewrite to add to db manually
    s = amcattest.create_test_set()
    a = amcattest.create_test_article()
    only_a = {a.id}
    nothing = set()

    def check(expected, aset):
        # Compare the expected ids with what ES returns for the set.
        self.assertEqual(expected, set(ES().query_ids(filters=dict(sets=aset.id))))

    s.add(a)
    check(nothing, s)
    s.refresh_index()
    check(only_a, s)

    # check adding of existing articles to a new set:
    s2 = amcattest.create_test_set()
    s2.add(a)
    s2.refresh_index()
    check(only_a, s2)

    # check that removing of articles from a set works and does not affect
    # other sets
    s2.remove_articles([a])
    s2.refresh_index()
    check(nothing, s2)
    check(only_a, s)

    s.remove_articles([a])
    check(only_a, s)
    s.refresh_index()
    check(nothing, s)

    # test that remove from index works for larger sets
    s = amcattest.create_test_set()
    arts = []
    for _ in range(20):
        arts.append(amcattest.create_test_article(medium=a.medium))
    s.add(*arts)
    s.refresh_index()
    solr_ids = set(ES().query_ids(filters=dict(sets=s.id)))
    self.assertEqual(solr_ids, {art.id for art in arts})

    # drop the first and the last article, one call each
    s.remove_articles([arts[0]])
    s.remove_articles([arts[-1]])
    s.refresh_index()
    solr_ids = set(ES().query_ids(filters=dict(sets=s.id)))
    self.assertEqual(solr_ids, {art.id for art in arts[1:-1]})

    # test that changing an article's properties can be reindexed
    changed = arts[1]
    changed.medium = amcattest.create_test_medium()
    changed.save()
def test_fuzzy(self):
    """Run do_test with several headline_ratio values on similar headlines."""
    s = amcattest.create_test_set()
    m = amcattest.create_test_medium()
    # Fields shared by all four articles; only the headline varies.
    adict = dict(text="text", articleset=s, medium=m)

    arts = []
    for headline in ("Dit is een test",
                     "Dit is ook een test",
                     "Dit is ook een tesdt",
                     "Is dit een test?"):
        arts.append(amcattest.create_test_article(headline=headline, **adict))

    unrestricted = self.do_test(arts, ignore_medium=True)
    self.assertEqual(unrestricted, {1, 2, 3, 4})

    at_90 = self.do_test(arts, ignore_medium=True, headline_ratio=90)
    self.assertEqual(at_90, {1, 2, 4})

    at_80 = self.do_test(arts, ignore_medium=True, headline_ratio=80)
    self.assertEqual(at_80, {1, 4})

    at_50 = self.do_test(arts, ignore_medium=True, headline_ratio=50)
    self.assertEqual(at_50, {1})
def test_dedup(self):
    """Run do_test on articles that differ in medium and page number."""
    s = amcattest.create_test_set()
    m1 = amcattest.create_test_medium()
    m2 = amcattest.create_test_medium()
    # Shared fields; the articles are created with deduplicate=False.
    adict = dict(text="text", headline="headline", articleset=s, deduplicate=False)

    # (medium, pagenr) per article; the last two entries are identical.
    combos = [(m1, 1), (m1, 2), (m2, 1), (m2, 2), (m2, 2)]
    arts = [
        amcattest.create_test_article(medium=medium, pagenr=page, **adict)
        for medium, page in combos
    ]

    expectations = [
        ({}, {1, 2, 3, 4}),
        ({'dry_run': True}, {1, 2, 3, 4, 5}),
        ({'ignore_medium': True}, {1, 2}),
        ({'ignore_page': True}, {1, 3}),
    ]
    for options, expected in expectations:
        self.assertEqual(self.do_test(arts, **options), expected)
def test_add_many(self):
    """Can we add a large number of articles from one set to another?"""
    s = amcattest.create_test_set()
    s2 = amcattest.create_test_set()
    m = amcattest.create_test_medium()
    p = amcattest.create_test_project()

    # 1213 unsaved articles sharing one project and medium.
    arts = [amcattest.create_test_article(project=p, medium=m, create=False)
            for _x in range(1213)]

    # Store them in the first set and make them searchable.
    Article.create_articles(arts, s)
    ES().flush()
    self.assertEqual(len(arts), s.get_count())

    # Add the same articles to the second set in one call.
    s2.add_articles(arts, monitor=ProgressMonitor())
    ES().flush()
    # (a leftover debug print of the count was removed; the assertion
    # already reports the value on failure)
    self.assertEqual(len(arts), s2.get_count())
def todo_test_zip(self):
    """Upload a zip with two text files and check the resulting articles.

    The method name does not start with `test`.
    """
    project_id = amcattest.create_test_project().id
    set_id = amcattest.create_test_set().id
    medium_id = amcattest.create_test_medium().id
    base = dict(project=project_id, articlesets=[set_id], medium=medium_id)

    with NamedTemporaryFile(prefix=u"upload_test", suffix=".zip") as f:
        # One entry at the archive root and one in a sub directory.
        with zipfile.ZipFile(f, "w") as zf:
            for entry, content in [("headline1.txt", "TEXT1"), ("x/headline2.txt", "TEXT2")]:
                zf.writestr(entry, content)
        f.flush()

        s = Text(file=File(f), date='2010-01-01', **base)
        result_set = ArticleSet.objects.get(id=s.run()[0])
        arts = list(result_set.articles.all())

        self.assertEqual({art.headline for art in arts}, {"headline1", "headline2"})
        self.assertEqual({art.section for art in arts}, {'', "x"})
        self.assertEqual({art.text for art in arts}, {"TEXT1", "TEXT2"})
def test_deduplication(self):
    """Does deduplication work as it is supposed to?"""
    # create dummy articles to have something in the db
    for _ in range(10):
        amcattest.create_test_article()
    amcates.ES().flush()

    art = dict(
        headline="test",
        text="test",
        byline="test",
        date='2001-01-01',
        medium=amcattest.create_test_medium(),
        project=amcattest.create_test_project(),
    )

    first = amcattest.create_test_article(**art)
    amcates.ES().flush()
    self.assertEqual(_q(mediumid=art['medium']), {first.id})

    # duplicate articles should not be added
    second = amcattest.create_test_article(**art)
    amcates.ES().flush()
    self.assertEqual(second.id, first.id)
    self.assertTrue(second.duplicate)
    self.assertEqual(_q(mediumid=art['medium']), {first.id})

    # however, if an articleset is given the 'existing' article
    # should be added to that set
    s1 = amcattest.create_test_set()
    in_set = amcattest.create_test_article(articleset=s1, **art)
    amcates.ES().flush()
    self.assertEqual(in_set.id, first.id)
    self.assertEqual(_q(mediumid=art['medium']), {first.id})
    self.assertEqual(set(s1.get_article_ids()), {first.id})
    self.assertEqual(_q(sets=s1.id), {first.id})

    # a dupe with a non-identical uuid is not a dupe
    uu = uuid.uuid4()
    distinct = amcattest.create_test_article(uuid=uu, **art)
    self.assertFalse(distinct.duplicate)
    self.assertEqual(distinct.uuid, uu)

    # if an existing uuid is set, it should be a perfect duplicate
    art['uuid'] = first.uuid
    amcattest.create_test_article(**art)  # okay
    amcates.ES().flush()
    # the article just created is a dupe, so it adds no new id
    self.assertEqual(_q(mediumid=art['medium']), {first.id, distinct.id})

    # same uuid but different content: not okay
    art['headline'] = "not the same"
    self.assertRaises(ValueError, amcattest.create_test_article, **art)
def test_save_parent(self):
    """Can we save objects with new and existing parents?"""
    m = amcattest.create_test_medium()
    root = create_test_article()
    s = create_test_set()
    # child index -> parent index within the list built by _articles
    structure = {1: 0, 2: 1, 3: 1, 4: 0}
    adict = dict(medium=m, date=datetime.date(2001, 1, 1), project=s.project)

    def _articles(n, structure):
        # n unsaved articles; article 0 hangs under root, the others follow structure
        built = []
        for i in range(n):
            built.append(Article(headline=str(i), text=str(i), **adict))
        built[0].parent = root
        for child, parent in structure.items():
            built[child].parent = built[parent]
        return built

    def _saved_by_text(ids):
        # Saved articles keyed by their numeric text
        return {int(art.text): art for art in Article.objects.filter(pk__in=ids)}

    # Trees are 3 levels deep, so it should take 3 queries to complete this request
    articles = _articles(5, structure)
    self.assertNumQueries(3, Article.create_articles, articles)

    ids = _q(mediumid=m.id)
    self.assertEqual(len(ids), 5)
    a = _saved_by_text(ids)

    # Are the parent properties set correctly?
    self.assertEqual(a[0].parent, root)
    for child, parent in structure.items():
        articles[child].parent = articles[parent]
        self.assertEqual(a[child].parent, a[parent])

    # can we save it again without errors? (And without queries, since it's all dupes)
    articles = _articles(5, structure)
    self.assertNumQueries(0, Article.create_articles, articles)
    self.assertEqual(len(_q(mediumid=m.id)), 5)

    # Can we insert new articles together with dupes?
    structure.update({5: 1, 6: 1})
    articles = _articles(7, structure)
    articles[6].parent = a[1]  # existing article
    amcates.ES().flush()
    # (inefficiency: it knows it can save 6 immediately, doesn't know it can also save 5 until dedup)
    self.assertNumQueries(2, Article.create_articles, articles)

    ids = _q(mediumid=m.id)
    self.assertEqual(len(ids), 7)
    a = _saved_by_text(ids)
    self.assertEqual(a[0].parent, root)
    for child, parent in structure.items():
        articles[child].parent = articles[parent]
        self.assertEqual(a[child].parent, a[parent])
def test_list_media(self):
    """Test that list media works for more than 10 media"""
    from amcat.models import Article

    media = [amcattest.create_test_medium() for _ in range(20)]
    arts = [amcattest.create_test_article(medium=medium, create=False) for medium in media]
    head_arts, tail_arts = arts[:5], arts[5:]
    head_media, tail_media = media[:5], media[5:]

    def store(articles, aset):
        # Save the articles into the set and make them searchable.
        Article.create_articles(articles, articleset=aset, check_duplicate=False, create_id=True)
        ES().flush()

    s1 = amcattest.create_test_set()
    store(head_arts, s1)
    self.assertEqual(set(s1.get_mediums()), set(head_media))

    s2 = amcattest.create_test_set(project=s1.project)
    store(tail_arts, s2)
    self.assertEqual(set(s2.get_mediums()), set(tail_media))

    # Both sets share one project, which therefore covers all media.
    self.assertEqual(set(s1.project.get_mediums()), set(media))
def test_article(self):
    """Upload a plain-text file three times with decreasing explicit metadata.

    1. date and headline given explicitly;
    2. only the date given, headline expected to equal the file name;
    3. neither given, date and headline expected to come from the file name.

    Each upload opens the temporary file through a `with` block so the
    handle is closed afterwards (previously three handles were leaked).
    """
    from django.core.files import File
    from tempfile import NamedTemporaryFile

    base = dict(project=amcattest.create_test_project().id,
                articleset=amcattest.create_test_set().id,
                medium=amcattest.create_test_medium().id)

    with NamedTemporaryFile(prefix=u"1999-12-31_\u0409\u0429\u0449\u04c3", suffix=".txt") as f:
        text = u'H. C. Andersens for\xe6ldre tilh\xf8rte samfundets laveste lag.'
        f.write(text.encode('utf-8'))
        f.flush()

        # dn: directory of the temp file; fn: file name without extension
        dn, fn = os.path.split(f.name)
        fn = os.path.splitext(fn)[0]

        # NOTE(review): the submitted headline is 'simple testxxx' while
        # 'simple test' is asserted - confirm whether that is intentional.
        with open(f.name) as upload:
            a, = Text(dict(date='2010-01-01', headline='simple testxxx',
                           file=File(upload), encoding=0, **base)).run()
        a = Article.objects.get(pk=a.id)
        self.assertEqual(a.headline, 'simple test')
        self.assertEqual(a.date.isoformat()[:10], '2010-01-01')
        self.assertEqual(a.text, text)

        # test autodetect headline from filename
        with open(f.name) as upload:
            a, = Text(dict(date='2010-01-01', file=File(upload), encoding=0, **base)).run()
        a = Article.objects.get(pk=a.id)
        self.assertEqual(a.headline, fn)
        self.assertEqual(a.date.isoformat()[:10], '2010-01-01')
        self.assertEqual(a.text, text)
        self.assertEqual(a.section, dn)

        # test autodetect date and headline from filename
        with open(f.name) as upload:
            a, = Text(dict(file=File(upload), encoding=0, **base)).run()
        a = Article.objects.get(pk=a.id)
        self.assertEqual(a.headline, fn.replace("1999-12-31_", ""))
        self.assertEqual(a.date.isoformat()[:10], '1999-12-31')
        self.assertEqual(a.text, text)
def test_add_many(self):
    """Copy a large batch of articles (1213) from one set into another.

    The articles are first created in set ``s``; the same articles are then
    added to ``s2`` and both sets must report the full count.
    """
    s = amcattest.create_test_set()
    s2 = amcattest.create_test_set()
    m = amcattest.create_test_medium()
    p = amcattest.create_test_project()

    arts = []
    for _ in range(1213):
        arts.append(amcattest.create_test_article(project=p, medium=m, create=False))

    # Store the batch in the source set and make it visible to queries.
    Article.create_articles(arts, s, create_id=True)
    ES().flush()
    self.assertEqual(len(arts), s.get_count())

    # Now add the very same articles to the target set.
    s2.add_articles(arts, monitor=ProgressMonitor())
    ES().flush()
    self.assertEqual(len(arts), s2.get_count())

    print(s2.get_count())
def test_full_refresh(self):
    """Check refresh_index with and without full_refresh after a content change.

    An article's medium is changed after it was indexed. A refresh with
    ``full_refresh=False`` is expected to leave the indexed medium untouched;
    a refresh with ``full_refresh=True`` is expected to pick up the change.
    """
    m1, m2 = [amcattest.create_test_medium() for _ in range(2)]
    a = amcattest.create_test_article(text='aap noot mies', medium=m1)
    s = amcattest.create_test_set()
    s.add(a)
    s.refresh_index()

    def indexed_ids(medium):
        """Ids of articles in set ``s`` that the index files under *medium*."""
        return set(ES().query_ids(filters=dict(sets=s.id, mediumid=medium.id)))

    self.assertEqual(indexed_ids(m1), {a.id})

    # Change the article in the database only.
    a.medium = m2
    a.save()

    # a should NOT be reindexed
    s.refresh_index(full_refresh=False)
    self.assertEqual(indexed_ids(m1), {a.id})
    self.assertEqual(indexed_ids(m2), set())

    # A full refresh should move the article to its new medium.
    s.refresh_index(full_refresh=True)
    self.assertEqual(indexed_ids(m1), set())
    self.assertEqual(indexed_ids(m2), {a.id})
def test_deduplication(self):
    """Does deduplication work as it is supposed to?

    Four articles with identical properties are created in turn:

    * ``a1`` - the original, which must be stored and indexed;
    * ``a2`` - created with ``check_duplicate=True``; must not be stored and
      must point at ``a1`` via ``duplicate_of``;
    * ``a3`` - as ``a2`` but with an article set; must not be stored either,
      and ``a1`` must end up in that set;
    * ``a4`` - created with ``check_duplicate=False``; must be stored.
    """
    art = dict(
        headline="test",
        byline="test",
        date='2001-01-01',
        medium=amcattest.create_test_medium(),
        project=amcattest.create_test_project(),
    )
    a1 = amcattest.create_test_article(**art)

    def q(**filters):
        """Flush the index and return the set of article ids matching *filters*."""
        amcates.ES().flush()
        return set(amcates.ES().query_ids(filters=filters))

    self.assertEqual(q(mediumid=art['medium']), {a1.id})

    # duplicate articles should not be added
    a2 = amcattest.create_test_article(check_duplicate=True, **art)
    self.assertFalse(Article.objects.filter(pk=a2.id).exists())
    self.assertEqual(a2.duplicate_of, a1.id)
    self.assertEqual(q(mediumid=art['medium']), {a1.id})

    # however, if an articleset is given the 'existing' article
    # should be added to that set
    s1 = amcattest.create_test_set()
    a3 = amcattest.create_test_article(check_duplicate=True, articleset=s1, **art)
    # Check a3 here: the earlier version re-checked a2 (copy-paste slip), so
    # a3 itself was never verified to be absent from the database.
    self.assertFalse(Article.objects.filter(pk=a3.id).exists())
    self.assertEqual(a3.duplicate_of, a1.id)
    self.assertEqual(q(mediumid=art['medium']), {a1.id})
    self.assertEqual(set(s1.get_article_ids()), {a1.id})
    self.assertEqual(q(sets=s1.id), {a1.id})

    # can we suppress duplicate checking?
    a4 = amcattest.create_test_article(check_duplicate=False, **art)
    self.assertTrue(Article.objects.filter(pk=a4.id).exists())
    self.assertFalse(hasattr(a4, 'duplicate_of'))
    self.assertIn(a4.id, q(mediumid=art['medium']))
def test_create_article_dicts(self):
    """Check the dictionaries that _get_article_dicts builds for two articles.

    The first article is in two sets and has a control character in its
    headline; the second is in one set and has non-ASCII text and byline.
    Selected keys of each resulting dict are compared to expected values.
    """
    from amcat.models.article import Article

    s1, s2 = [amcattest.create_test_set() for _x in range(2)]
    p = amcattest.create_test_project()
    m = amcattest.create_test_medium()

    a1 = amcattest.create_test_article(headline="bla \x1d blo", text="test",
                                       project=p, medium=m)
    a2 = amcattest.create_test_article(headline="blabla", text="t\xe9st!",
                                       byline="\u0904\u0905 test",
                                       project=p, medium=m)
    s1.add(a1)
    s2.add(a1)
    s2.add(a2)

    # fetch through a queryset to force a db roundtrip and deserialize the date
    queryset = Article.objects.filter(pk__in=[a1.id, a2.id])
    ad1, ad2 = list(_get_article_dicts(queryset))

    expected_first = {
        "id": a1.id,
        "headline": "bla blo",
        "body": "test",
        "byline": None,
        "section": None,
        "projectid": p.id,
        "mediumid": m.id,
        "sets": {s1.id, s2.id},
    }
    for k, v in expected_first.items():
        self.assertEqual(ad1[k], v, "Article 1 %s %r!=%r" % (k, ad1[k], v))

    expected_second = {
        "id": a2.id,
        "headline": "blabla",
        "body": "t\xe9st!",
        "byline": "\u0904\u0905 test",
        "section": None,
        "projectid": p.id,
        "mediumid": m.id,
        "sets": {s2.id},
    }
    for k, v in expected_second.items():
        self.assertEqual(ad2[k], v, "Article 2 %s %r!=%r" % (k, ad2[k], v))
def test_zip(self):
    """Upload a zip archive with two text files and check the created articles.

    The archive holds ``headline1.txt`` at the top level and
    ``headline2.txt`` inside directory ``x``. The articles of the object
    returned by the script are compared on headline, section and text.
    """
    from tempfile import NamedTemporaryFile
    from django.core.files import File
    import zipfile

    base = dict(project=amcattest.create_test_project().id,
                articleset=amcattest.create_test_set().id,
                medium=amcattest.create_test_medium().id)

    with NamedTemporaryFile(prefix=u"upload_test", suffix=".zip") as f:
        # Build the archive directly in the temporary file.
        with zipfile.ZipFile(f, "w") as archive:
            archive.writestr("headline1.txt", "TEXT1")
            archive.writestr("x/headline2.txt", "TEXT2")
        f.flush()

        script = Text(file=File(f), date='2010-01-01', **base)
        result = script.run()
        arts = list(result.articles.all())

        headlines = {art.headline for art in arts}
        sections = {art.section for art in arts}
        texts = {art.text for art in arts}

        self.assertEqual(headlines, {"headline1", "headline2"})
        self.assertEqual(sections, {'', "x"})
        self.assertEqual(texts, {"TEXT1", "TEXT2"})
def test_zip(self):
    """Upload a zip archive with two text files and check the returned articles.

    The archive holds ``headline1.txt`` at the top level and
    ``headline2.txt`` inside directory ``x``. The value returned by the
    script's ``run()`` is iterated directly and compared on headline,
    section and text.
    """
    from tempfile import NamedTemporaryFile
    from django.core.files import File
    import zipfile

    base = dict(project=amcattest.create_test_project().id,
                articleset=amcattest.create_test_set().id,
                medium=amcattest.create_test_medium().id)

    members = [("headline1.txt", "TEXT1"), ("x/headline2.txt", "TEXT2")]

    with NamedTemporaryFile(prefix=u"upload_test", suffix=".zip") as f:
        # Build the archive directly in the temporary file.
        with zipfile.ZipFile(f, "w") as archive:
            for member_name, content in members:
                archive.writestr(member_name, content)
        f.flush()

        script = Text(file=File(f), date='2010-01-01', **base)
        arts = script.run()

        self.assertEqual({art.headline for art in arts}, {"headline1", "headline2"})
        self.assertEqual({art.section for art in arts}, {'', "x"})
        self.assertEqual({art.text for art in arts}, {"TEXT1", "TEXT2"})
def test_aggregation(self):
    """Can we select articles and aggregate them per medium?

    One project gets 5 articles in one medium and 15 in another. Listing is
    checked per project and per project+medium; aggregation on the medium
    axis must yield the two counts. Finally the same articles are put in a
    set of a second project and aggregated again.
    """
    p = amcattest.create_test_project()
    m1, m2 = [amcattest.create_test_medium() for _ in [1, 2]]
    arts1 = {amcattest.create_test_article(project=p, medium=m1) for _ in range(5)}
    arts2 = {amcattest.create_test_article(project=p, medium=m2) for _ in range(15)}
    everything = arts1 | arts2

    # can we select on mediumid
    self.assertEqual(self.list(projects=[p.id]), everything)
    self.assertEqual(self.list(projects=[p.id], mediums=[m1.id]), arts1)

    # can we make a table?
    table = self.aggr(projects=[p.id], xAxis='medium')
    self.assertEqual(set(table), {(5,), (15,)})

    # add second project with articles from first project in set
    p2 = amcattest.create_test_project()
    s = amcattest.create_test_set(project=p2)
    s.add(*everything)
    # NOTE(review): the result of this aggregation is not asserted on; the
    # call only verifies that it completes without raising.
    table = self.aggr(projects=[p2.id], articlesets=[s.id], xAxis='medium')
def test_article(self):
    """Upload one UTF-8 text file three times and check the stored article.

    First with explicit date and headline, then with only a date (headline
    compared to the file name), and finally with neither (date and headline
    compared to the parts of the ``1999-12-31_...`` file name).
    """
    from django.core.files import File
    from tempfile import NamedTemporaryFile

    base = dict(project=amcattest.create_test_project().id,
                articleset=amcattest.create_test_set().id,
                medium=amcattest.create_test_medium().id)

    with NamedTemporaryFile(prefix=u"1999-12-31_\u0409\u0429\u0449\u04c3", suffix=".txt") as f:
        text = u'H. C. Andersens for\xe6ldre tilh\xf8rte samfundets laveste lag.'
        f.write(text.encode('utf-8'))
        f.flush()

        dn, fn = os.path.split(f.name)
        fn, ext = os.path.splitext(fn)

        # Each upload opens the file in a ``with`` block so the handle is
        # closed after the script has run instead of being leaked.
        with open(f.name) as upload:
            a, = Text(dict(date='2010-01-01', headline='simple testxxx',
                           file=File(upload), encoding=0, **base)).run()
        a = Article.objects.get(pk=a.id)
        # NOTE(review): the headline submitted above is 'simple testxxx';
        # confirm that expecting 'simple test' here is intentional.
        self.assertEqual(a.headline, 'simple test')
        self.assertEqual(a.date.isoformat()[:10], '2010-01-01')
        self.assertEqual(a.text, text)

        # test autodetect headline from filename
        with open(f.name) as upload:
            a, = Text(dict(date='2010-01-01', file=File(upload),
                           encoding=0, **base)).run()
        a = Article.objects.get(pk=a.id)
        self.assertEqual(a.headline, fn)
        self.assertEqual(a.date.isoformat()[:10], '2010-01-01')
        self.assertEqual(a.text, text)
        self.assertEqual(a.section, dn)

        # test autodetect date and headline from filename
        with open(f.name) as upload:
            a, = Text(dict(file=File(upload), encoding=0, **base)).run()
        a = Article.objects.get(pk=a.id)
        self.assertEqual(a.headline, fn.replace("1999-12-31_", ""))
        self.assertEqual(a.date.isoformat()[:10], '1999-12-31')
        self.assertEqual(a.text, text)
def test_aggregate(self):
    """Can we make tables per medium/date interval?

    Four articles (one in medium m1, three in m2, spread over Jan 2001,
    Feb 2001 and Jan 2002) are put in set s1; a fifth article in another
    set and medium must not show up. Aggregations are checked by medium id,
    medium name, year and month, followed by the set statistics and the
    media list.
    """
    from amcat.models import Article

    m1 = amcattest.create_test_medium(name="De Nep-Krant")
    m2, m3 = [amcattest.create_test_medium() for _ in range(2)]
    s1 = amcattest.create_test_set()
    s2 = amcattest.create_test_set()

    # Article outside s1; it should never be counted below.
    unused = amcattest.create_test_article(text='aap noot mies', medium=m3, articleset=s2)

    a = amcattest.create_test_article(text='aap noot mies', medium=m1,
                                      date='2001-01-01', create=False)
    b = amcattest.create_test_article(text='noot mies wim zus', medium=m2,
                                      date='2001-02-01', create=False)
    c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', medium=m2,
                                      date='2002-01-01', create=False)
    d = amcattest.create_test_article(text='noot mies wim zus', medium=m2,
                                      date='2001-02-03', create=False)
    Article.create_articles([a, b, c, d], articleset=s1, check_duplicate=False)
    ES().flush()

    def aggregate(**options):
        """Aggregate over set s1 and return the result as a dict."""
        return dict(ES().aggregate_query(filters=dict(sets=s1.id), **options))

    # counts per mediumid
    self.assertEqual(aggregate(group_by="mediumid"), {m1.id: 1, m2.id: 3})

    # counts per medium (name)
    self.assertEqual(aggregate(group_by="medium"), {m1.name: 1, m2.name: 3})

    # counts per year
    per_year = {datetime(2001, 1, 1): 3, datetime(2002, 1, 1): 1}
    self.assertEqual(aggregate(group_by="date", date_interval="year"), per_year)

    # counts per month
    per_month = {
        datetime(2001, 1, 1): 1,
        datetime(2002, 1, 1): 1,
        datetime(2001, 2, 1): 2,
    }
    self.assertEqual(aggregate(group_by="date", date_interval="month"), per_month)

    # set statistics
    stats = ES().statistics(filters=dict(sets=s1.id))
    self.assertEqual(stats.n, 4)
    self.assertEqual(stats.start_date, datetime(2001, 1, 1))
    self.assertEqual(stats.end_date, datetime(2002, 1, 1))

    # media list
    listed_media = set(ES().list_media(filters=dict(sets=s1.id)))
    self.assertEqual(listed_media, {m1.id, m2.id})
def test_deduplicate(self):
    """Run DeduplicateScript on a set of four near-identical articles.

    All four articles share headline, project, medium and date but differ in
    text, section and metastring. After running the script with
    ``slow=True`` the set must contain only ``art1``, while the project
    still holds all four articles.
    """
    p = amcattest.create_test_project()
    m = amcattest.create_test_medium()

    # Dates use plain ``1`` rather than ``01``: a leading-zero integer
    # literal is a SyntaxError on Python 3 (and octal on Python 2); the value
    # is the same.
    art2 = amcattest.create_test_article(
        headline='blaat1', project=p, medium=m,
        text=""" bla bla bla bla bla bla bla var c=0; var t; var timer_is_on=0; function timedCount() { document.getElementById('txt').value=c; c=c+1; t=setTimeout(function(){timedCount()},1000); } function doTimer() { if (!timer_is_on) { timer_is_on=1; timedCount(); } } function stopCount() { clearTimeout(t); timer_is_on=0; } """,
        date=m_date(2012, 1, 1),
        section="kaas",
        metastring={
            'moet_door': False,
            'delete?': True,
            'mist': 'link, heeft wel meer tekst'
        })
    art1 = amcattest.create_test_article(
        headline='blaat1',
        project=p,
        text=""" bla bla bla [bla](http://www.bla.com) bla bla bla """,
        date=m_date(2012, 1, 1),
        section="kaas",
        metastring={
            'moet_door': True,
            'delete?': False,
            'mist': 'niets'
        },
        medium=m,
    )
    art3 = amcattest.create_test_article(
        headline='blaat1', project=p, medium=m,
        text=""" bla bla bla [bla](http://www.bla.com) bla bla bla """,
        date=m_date(2012, 1, 1),
        metastring={'mist': '3 fields'})
    art4 = amcattest.create_test_article(
        headline='blaat1', project=p, medium=m,
        text=""" bla bla bla [bla](http://www.bla.com) bla bla bla """,
        date=m_date(2012, 1, 1),
        section="kaas",
        metastring={
            'moet_door': False,
            'delete?': True,
            'mist': 'later gemaakt'
        })

    artset = amcattest.create_test_set(articles=[art1, art2, art3, art4])
    d = DeduplicateScript(articleset=artset.id, slow=True)
    d.run(None)

    # Only art1 stays in the set; nothing is removed from the project.
    self.assertEqual(len(artset.articles.all()), 1)
    self.assertEqual(len(Article.objects.filter(project=p)), 4)
    self.assertEqual(art1.pk, artset.articles.all()[0].pk)
def test_medium_name(self):
    """Query the "medium" field of a freshly created article and print the result.

    No assertion is made on the result; the test only fails if creating the
    article or running the query raises.
    """
    medium = amcattest.create_test_medium(name="de testkrant")
    article = amcattest.create_test_article(medium=medium)

    es = amcates.ES()
    result = es.query(filters={"id": article.id}, fields=["medium"])
    print(result)