def set_up(self):
    """Create three articles with properties, spread over two indexed sets.

    a1 and a2 end up in aset1, a3 in aset2; the index is refreshed at the end.
    """
    fixtures = (
        ("a1", "aap noot mies", datetime.datetime(2010, 1, 1),
         {"author": "De Bas", "length_int": 5}),
        ("a2", "aap noot geit", datetime.datetime(2010, 1, 1),
         {"author": "Het Martijn", "length_int": 5}),
        ("a3", "lamp", datetime.datetime(2010, 1, 2),
         {"author": "Het Martijn", "length_int": 15}),
    )
    for attr, text, date, properties in fixtures:
        article = amcattest.create_test_article()
        setattr(self, attr, article)
        article.text = text
        article.date = date
        article.properties = properties
        article.save()

    self.aset1 = amcattest.create_test_set()
    self.aset1.add_articles([self.a1, self.a2])
    self.aset1.refresh_index(True)

    self.aset2 = amcattest.create_test_set()
    self.aset2.add_articles([self.a3])
    self.aset2.refresh_index(True)

    ES().refresh()
def test_scores(self):
    """test if scores (and matches) are as expected for various queries"""
    s = amcattest.create_test_set(articles=[
        amcattest.create_test_article(title="a", text='dit is een test'),
    ])
    s.refresh_index()

    def q(query):
        # map title -> score for every hit within the test set
        result = ES().query(query, filters={'sets': s.id}, _source=["title"])
        return {a.title: a.score for a in result}

    self.assertEqual(q("test"), {"a": 1})

    a = amcattest.create_test_article(text='aap noot mies', title='m1')
    b = amcattest.create_test_article(text='noot mies wim zus', title='m2')
    c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', title='m2')
    d = amcattest.create_test_article(
        text='ik woon in een sociale huurwoning, net als anderen', title='m2')
    ES().refresh()

    # unfiltered wildcard query
    self.assertEqual(set(ES().query_ids("no*")), {a.id, b.id})

    # queries restricted to title 'm2': (query string, expected article ids)
    filtered_cases = [
        ("no*", {b.id}),
        ("zus AND jet", {c.id}),
        ("zus OR jet", {b.id, c.id}),
        ('"mies wim"', {b.id}),
        ('"mies wim"~5', {b.id, c.id}),
        ('"sociale huur*"', {d.id}),
    ]
    for query, expected in filtered_cases:
        found = set(ES().query_ids(query, filters=dict(title='m2')))
        self.assertEqual(found, expected)
def test_aggregation(self):
    """Can we create nice tables?"""
    project = amcattest.create_test_project()
    m1 = amcattest.create_test_medium()
    m2 = amcattest.create_test_medium()
    arts1 = {amcattest.create_test_article(project=project, medium=m1) for _ in range(5)}
    arts2 = {amcattest.create_test_article(project=project, medium=m2) for _ in range(15)}
    everything = arts1 | arts2

    aset = amcattest.create_test_set(project=project)
    aset.add_articles(everything)
    aset.refresh_index()

    # can we select on mediumid
    self.assertEqual(self.list(projects=[project.id]), self.pks(everything))
    self.assertEqual(self.list(projects=[project.id], mediums=[m1.id]), self.pks(arts1))

    # can we make a table?
    x = self.aggr(projects=[project.id], xAxis='medium')
    self.assertEqual(set(x), {(5,), (15,)})

    # add second project with articles from first project in set
    p2 = amcattest.create_test_project()
    s = amcattest.create_test_set(project=p2)
    s.add(*everything)
    # the resulting table is computed but not asserted on in this block
    x = self.aggr(projects=[p2.id], articlesets=[s.id], xAxis='medium')
def test_scores(self):
    """test if scores (and matches) are as expected for various queries"""
    s = amcattest.create_test_set(articles=[
        amcattest.create_test_article(headline="a", text='dit is een test'),
    ])
    s.refresh_index()

    def q(query):
        # map headline -> score for every hit within the test set
        result = ES().query(query, filters={'sets': s.id}, fields=["headline"])
        return {a.headline: a.score for a in result}

    self.assertEqual(q("test"), {"a": 1})

    m1, m2 = [amcattest.create_test_medium() for _ in range(2)]
    a = amcattest.create_test_article(text='aap noot mies', medium=m1)
    b = amcattest.create_test_article(text='noot mies wim zus', medium=m2)
    c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', medium=m2)
    d = amcattest.create_test_article(
        text='ik woon in een sociale huurwoning, net als anderen', medium=m2)
    ES().add_articles([a.id, b.id, c.id, d.id])
    ES().flush()

    # unfiltered wildcard query
    self.assertEqual(set(ES().query_ids("no*")), {a.id, b.id})

    # queries restricted to medium m2: (query string, expected article ids)
    filtered_cases = [
        ("no*", {b.id}),
        ("zus AND jet", {c.id}),
        ("zus OR jet", {b.id, c.id}),
        ('"mies wim"', {b.id}),
        ('"mies wim"~5', {b.id, c.id}),
        ('"sociale huur*"', {d.id}),
    ]
    for query, expected in filtered_cases:
        found = set(ES().query_ids(query, filters=dict(mediumid=m2.id)))
        self.assertEqual(found, expected)
def test_deduplication(self):
    """Does deduplication work as it is supposed to?"""
    art = dict(headline="test", byline="test", date='2001-01-01',
               medium=amcattest.create_test_medium(),
               project=amcattest.create_test_project(),
               )
    a1 = amcattest.create_test_article(**art)

    def q(**filters):
        # flush first so that the index reflects the latest writes
        amcates.ES().flush()
        return set(amcates.ES().query_ids(filters=filters))

    self.assertEqual(q(mediumid=art['medium']), {a1.id})

    # duplicate articles should not be added
    a2 = amcattest.create_test_article(check_duplicate=True, **art)
    self.assertFalse(Article.objects.filter(pk=a2.id).exists())
    self.assertEqual(a2.duplicate_of, a1.id)
    self.assertEqual(q(mediumid=art['medium']), {a1.id})

    # however, if an articleset is given the 'existing' article
    # should be added to that set
    s1 = amcattest.create_test_set()
    a3 = amcattest.create_test_article(check_duplicate=True, articleset=s1, **art)
    # check the article just created (a3); the original re-checked a2 here
    self.assertFalse(Article.objects.filter(pk=a3.id).exists())
    self.assertEqual(a3.duplicate_of, a1.id)
    self.assertEqual(q(mediumid=art['medium']), {a1.id})
    self.assertEqual(set(s1.get_article_ids()), {a1.id})
    self.assertEqual(q(sets=s1.id), {a1.id})

    # can we suppress duplicate checking?
    a4 = amcattest.create_test_article(check_duplicate=False, **art)
    self.assertTrue(Article.objects.filter(pk=a4.id).exists())
    self.assertFalse(hasattr(a4, 'duplicate_of'))
    self.assertIn(a4.id, q(mediumid=art['medium']))
def test_filters(self):
    """Do filters work properly?"""
    m1 = amcattest.create_test_medium()
    m2 = amcattest.create_test_medium()
    a = amcattest.create_test_article(text='aap noot mies', medium=m1, date="2001-01-01")
    b = amcattest.create_test_article(text='noot mies wim zus', medium=m2, date="2002-01-01")
    c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', medium=m2,
                                      date="2003-01-01")

    s1 = amcattest.create_test_set(articles=[a, b, c])
    s2 = amcattest.create_test_set(articles=[a, b])
    ES().flush()

    def q(**filters):
        # ids of all articles matching the given filters (no query string)
        return set(ES().query_ids(filters=filters))

    # MEDIUM FILTER
    self.assertEqual(q(mediumid=m2.id), {b.id, c.id})

    # DATE FILTERS
    self.assertEqual(q(sets=s1.id, start_date='2001-06-01'), {b.id, c.id})
    # start is inclusive
    self.assertEqual(q(sets=s1.id, start_date='2002-01-01', end_date="2002-06-01"), {b.id})
    # end is exclusive
    self.assertEqual(q(sets=s1.id, start_date='2001-01-01', end_date="2003-01-01"),
                     {a.id, b.id})

    # COMBINATION
    self.assertEqual(q(sets=s2.id, start_date='2001-06-01'), {b.id})
    self.assertEqual(q(end_date='2002-06-01', mediumid=m2.id), {b.id})
def test_aggregate(self):
    """Can we make tables per medium/date interval?"""
    from amcat.models import Article

    m1 = amcattest.create_test_medium(name="De Nep-Krant")
    m2 = amcattest.create_test_medium()
    m3 = amcattest.create_test_medium()
    s1 = amcattest.create_test_set()
    s2 = amcattest.create_test_set()

    # this article lives in s2 only, so it must not show up in the s1 results
    unused = amcattest.create_test_article(text='aap noot mies', medium=m3, articleset=s2)

    specs = [
        ('aap noot mies', m1, '2001-01-01'),
        ('noot mies wim zus', m2, '2001-02-01'),
        ('mies bla bla bla wim zus jet', m2, '2002-01-01'),
        ('noot mies wim zus', m2, '2001-02-03'),
    ]
    unsaved = [amcattest.create_test_article(text=text, medium=medium, date=date, create=False)
               for text, medium, date in specs]
    Article.create_articles(unsaved, articleset=s1, check_duplicate=False, create_id=True)
    ES().flush()

    in_s1 = dict(sets=s1.id)

    def aggregate(**options):
        return dict(ES().aggregate_query(filters=in_s1, **options))

    self.assertEqual(aggregate(group_by="mediumid"), {m1.id: 1, m2.id: 3})
    self.assertEqual(aggregate(group_by="date", date_interval="year"),
                     {datetime(2001, 1, 1): 3, datetime(2002, 1, 1): 1})
    self.assertEqual(aggregate(group_by="date", date_interval="month"),
                     {datetime(2001, 1, 1): 1, datetime(2002, 1, 1): 1, datetime(2001, 2, 1): 2})

    # set statistics
    stats = ES().statistics(filters=in_s1)
    self.assertEqual(stats.n, 4)
    self.assertEqual(stats.start_date, datetime(2001, 1, 1))
    self.assertEqual(stats.end_date, datetime(2002, 1, 1))

    # media list
    self.assertEqual(set(ES().list_media(filters=in_s1)), {m1.id, m2.id})
def set_up(self):
    """Create three articles (keeping their media), two indexed sets, then flush.

    a1 and a2 end up in aset1, a3 in aset2.
    """
    fixtures = (
        ("aap noot mies", datetime.datetime(2010, 1, 1)),
        ("aap noot geit", datetime.datetime(2010, 1, 1)),
        ("lamp", datetime.datetime(2010, 1, 2)),
    )
    for number, (text, date) in enumerate(fixtures, start=1):
        article = amcattest.create_test_article()
        setattr(self, "a%d" % number, article)
        article.text = text
        article.date = date
        article.save()
        # remember the medium that came with the article as m1..m3
        setattr(self, "m%d" % number, article.medium)

    self.aset1 = amcattest.create_test_set()
    self.aset1.add_articles([self.a1, self.a2])
    self.aset1.refresh_index(True)

    self.aset2 = amcattest.create_test_set()
    self.aset2.add_articles([self.a3])
    self.aset2.refresh_index(True)

    ES().flush()
def test_filters(self):
    """Do filters work properly?"""
    a = amcattest.create_test_article(text='aap noot mies', title='m1', date="2001-01-01")
    b = amcattest.create_test_article(text='noot mies wim zus', title='m2', date="2002-01-01")
    c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', title='m2',
                                      date="2003-01-01")

    s1 = amcattest.create_test_set(articles=[a, b, c])
    s2 = amcattest.create_test_set(articles=[a, b])
    ES().refresh()

    def q(**filters):
        # ids of all articles matching the given filters (no query string)
        return set(ES().query_ids(filters=filters))

    # TITLE FILTER
    self.assertEqual(q(title='m2'), {b.id, c.id})

    # DATE FILTERS
    self.assertEqual(q(sets=s1.id, start_date='2001-06-01'), {b.id, c.id})
    # start is inclusive
    self.assertEqual(q(sets=s1.id, start_date='2002-01-01', end_date="2002-06-01"), {b.id})
    # end is exclusive
    self.assertEqual(q(sets=s1.id, start_date='2001-01-01', end_date="2003-01-01"),
                     {a.id, b.id})

    # COMBINATION
    self.assertEqual(q(sets=s2.id, start_date='2001-06-01'), {b.id})
    self.assertEqual(q(end_date='2002-06-01', title='m2'), {b.id})
def test_complex_phrase_query(self):
    """Test complex phrase queries. DOES NOT WORK YET"""
    texts = ['aap noot mies', 'noot mies wim zus', 'mies bla bla bla wim zus jet']
    a, b, c = [amcattest.create_test_article(text=text) for text in texts]
    s1 = amcattest.create_test_set(articles=[a, b, c])
    ES().add_articles([a.id, b.id, c.id])

    # wildcard terms inside a proximity phrase
    hits = set(ES().query_ids('"mi* wi*"~5', filters=dict(sets=s1.id)))
    self.assertEqual(hits, {b.id, c.id})
def test_complex_phrase_query(self):
    """Test complex phrase queries. DOES NOT WORK YET"""
    texts = ['aap noot mies', 'noot mies wim zus', 'mies bla bla bla wim zus jet']
    a, b, c = [amcattest.create_test_article(text=text) for text in texts]
    s1 = amcattest.create_test_set(articles=[a, b, c])
    ES().refresh()

    # wildcard terms inside a proximity phrase
    hits = set(ES().query_ids('"mi* wi*"~5', filters=dict(sets=s1.id)))
    self.assertEqual(hits, {b.id, c.id})
def test_family(self):
    """Check parent/children links between articles created with parent_hash."""
    parent = amcattest.create_test_article()

    # a fresh article has neither parent nor children
    self.assertEqual(parent.parent, None)
    self.assertEqual(set(parent.children), set())

    child1 = amcattest.create_test_article(parent_hash=parent.hash)
    child2 = amcattest.create_test_article(parent_hash=parent.hash)

    self.assertEqual(child1.parent, parent)
    self.assertEqual(set(parent.children), {child1, child2})
def test_create(self):
    """Can we create a coding job with articles?"""
    from amcat.models.project import Project

    project = amcattest.create_test_project()
    job = amcattest.create_test_job(project=project)
    self.assertIsNotNone(job)
    self.assertEqual(job.project, Project.objects.get(pk=project.id))

    for _ in range(3):
        job.articleset.add(amcattest.create_test_article())

    # the expected total is spelled 1+3: one more than the three added here
    self.assertEqual(1+3, len(job.articleset.articles.all()))
def test_byline(self):
    """Check field-specific and plain queries against an article's byline."""
    aset = amcattest.create_test_set()
    amcattest.create_test_article(byline="bob", text="eve", articleset=aset)
    ES().flush()

    def hits(query):
        # ids matching the query string within the test set
        return set(ES().query_ids(query, filters={"sets": aset.id}))

    self.assertEqual(1, len(hits("byline:bob")))
    self.assertEqual(0, len(hits("byline:eve")))
    self.assertEqual(1, len(hits("bob")))
def test_create(self):
    """Can we create a coding job with articles?"""
    from amcat.models.project import Project

    project = amcattest.create_test_project()
    job = amcattest.create_test_job(project=project)
    self.assertIsNotNone(job)
    self.assertEqual(job.project, Project.objects.get(pk=project.id))

    added = 0
    while added < 3:
        job.articleset.add(amcattest.create_test_article())
        added += 1

    # the expected total is spelled 1 + 3: one more than the three added here
    self.assertEqual(1 + 3, len(job.articleset.articles.all()))
def set_up(self):
    """Create a project with one set holding five short articles, then flush."""
    self.project = amcattest.create_test_project()

    # creation order is a1, a2, a4, a3, a5 (a4 is created before a3)
    in_creation_order = (
        ("a1", "aap noot mies"),
        ("a2", "aap noot"),
        ("a4", "aap noot"),
        ("a3", "aap"),
        ("a5", "vuur"),
    )
    for attr, text in in_creation_order:
        setattr(self, attr, amcattest.create_test_article(text=text))

    members = (self.a1, self.a2, self.a3, self.a4, self.a5)
    amcattest.create_test_set(members, project=self.project)
    ES().flush()
def test_deduplicate(self):
    """One article should be deleted from artset and added to project 2"""
    project = amcattest.create_test_project()
    # the first and third article share the same url
    urls = ('blaat1', 'blaat2', 'blaat1')
    articles = [amcattest.create_test_article(url=url, project=project) for url in urls]
    artset = amcattest.create_test_set(articles=articles)

    script = DeduplicateScript(articleset=artset.id)
    script.run(None)

    self.assertEqual(len(artset.articles.all()), 2)
    # NOTE(review): project id 2 is hard-coded here — confirm it is the intended target
    self.assertEqual(len(Article.objects.filter(project=2)), 1)
def setup(self):
    """Build two sets and five articles; return ``(s1, s2, a, b, c, d, e)``.

    a-d are built with ``create=False`` and stored via ``Article.create_articles``
    into s1; e is created directly in s2. The index is refreshed before returning.
    """
    s1 = amcattest.create_test_set()
    s2 = amcattest.create_test_set()

    specs = (
        ('aap noot mies', 'm1', '2001-01-01'),
        ('noot mies wim zus', 'm2', '2001-02-01'),
        ('mies bla bla bla wim zus jet', 'm2', '2002-01-01'),
        ('noot mies wim zus', 'm2', '2001-02-03'),
    )
    a, b, c, d = [
        amcattest.create_test_article(text=text, title=title, date=date, create=False)
        for text, title, date in specs
    ]
    e = amcattest.create_test_article(text='aap noot mies', title='m3', articleset=s2)

    Article.create_articles([a, b, c, d], articleset=s1)
    ES().refresh()
    return s1, s2, a, b, c, d, e
def test_date(self):
    """Check ``do_test`` results with and without ``ignore_date``.

    Articles 1 and 2 share a day but differ in time; article 3 is a day later.
    """
    s = amcattest.create_test_set()
    m = amcattest.create_test_medium()
    arts = [
        amcattest.create_test_article(id=1, articleset=s, medium=m, date="2001-01-01"),
        amcattest.create_test_article(id=2, articleset=s, medium=m, date="2001-01-01 02:00"),
        amcattest.create_test_article(id=3, articleset=s, medium=m, date="2001-01-02"),
    ]

    self.assertEqual(self.do_test(arts), {1, 2, 3})
    self.assertEqual(self.do_test(arts, ignore_date=True), {1, 3})
def set_up(self):
    """Create one set with four articles over two media.

    Returns ``(aset, m1, m2, a1, a2, a3, a4)``. a3 is created without an
    explicit date.
    """
    # We cannot use setUp, as use_elastic deletes indices
    aset = amcattest.create_test_set()
    m1 = amcattest.create_test_medium()
    m2 = amcattest.create_test_medium()

    def make(text, medium, **extra):
        return amcattest.create_test_article(text=text, medium=medium, articleset=aset, **extra)

    a1 = make("Foo", m1, date=datetime(2014, 4, 3))
    a2 = make("Bar", m1, date=datetime(2015, 4, 3))
    a3 = make("FooBar", m2)
    a4 = make("BarFoo", m2, date=datetime(2014, 1, 3))

    ES().flush()
    return aset, m1, m2, a1, a2, a3, a4
def test_date(self):
    """Check ``do_test`` results with and without ``ignore_date``.

    The first two articles share a day but differ in time; the third is a
    day later. All other fields are identical.
    """
    s = amcattest.create_test_set()
    m = amcattest.create_test_medium()
    adict = dict(text="text", headline="headline", articleset=s, medium=m)
    arts = [
        amcattest.create_test_article(date="2001-01-01", **adict),
        amcattest.create_test_article(date="2001-01-01 02:00", **adict),
        amcattest.create_test_article(date="2001-01-02", **adict),
    ]

    self.assertEqual(self.do_test(arts), {1, 2, 3})
    self.assertEqual(self.do_test(arts, ignore_date=True), {1, 3})
def test_narticles_in_queue(self):
    """Articles added to a project, or to a set in it, end up on the queue."""
    # articles added to a project are on the queue
    p = amcattest.create_test_project()
    self.assertEqual(AnalysisQueue.narticles_in_queue(p), 0)
    for _i in range(10):
        amcattest.create_test_article(project=p)
    self.assertEqual(AnalysisQueue.narticles_in_queue(p), 10)

    # articles added to a set in the project are on the queue
    arts = [amcattest.create_test_article() for _i in range(10)]
    s = amcattest.create_test_set(project=p)
    self.assertEqual(AnalysisQueue.narticles_in_queue(p), 10)
    # explicit loop: a bare map() is lazy on Python 3 and would never call s.add
    for article in arts:
        s.add(article)
    self.assertEqual(AnalysisQueue.narticles_in_queue(p), 20)
def set_up(self):
    """Create a project with one set holding five short articles, then refresh."""
    self.project = amcattest.create_test_project()

    make = amcattest.create_test_article
    # a4 is deliberately created before a3, as in the original order
    self.a1, self.a2, self.a4, self.a3, self.a5 = (
        make(text="aap noot mies"),
        make(text="aap noot"),
        make(text="aap noot"),
        make(text="aap"),
        make(text="vuur"),
    )

    members = (self.a1, self.a2, self.a3, self.a4, self.a5)
    amcattest.create_test_set(members, project=self.project)
    ES().refresh()
def test_dedup(self):
    """Check ``do_test`` results for medium/page combinations and its options."""
    s = amcattest.create_test_set()
    m1 = amcattest.create_test_medium()
    m2 = amcattest.create_test_medium()

    # (medium, pagenr) per article; ids are assigned 1..5 in this order
    layout = [(m1, 1), (m1, 2), (m2, 1), (m2, 2), (m2, 2)]
    arts = [
        amcattest.create_test_article(articleset=s, medium=medium, pagenr=page, id=number)
        for number, (medium, page) in enumerate(layout, start=1)
    ]

    self.assertEqual(self.do_test(arts), {1, 2, 3, 4})
    self.assertEqual(self.do_test(arts, dry_run=True), {1, 2, 3, 4, 5})
    self.assertEqual(self.do_test(arts, ignore_medium=True), {1, 2})
    self.assertEqual(self.do_test(arts, ignore_page=True), {1, 3})
def setup(self):
    """Build three media, two sets and five articles.

    Returns ``(m1, m2, m3, s1, s2, a, b, c, d, e)``. a-d are built with
    ``create=False`` and stored into s1 via ``Article.create_articles`` with
    duplicate checking off; e is created directly in s2.
    """
    m1 = amcattest.create_test_medium(name="De Nep-Krant")
    m2 = amcattest.create_test_medium()
    m3 = amcattest.create_test_medium()
    s1 = amcattest.create_test_set()
    s2 = amcattest.create_test_set()

    specs = (
        ('aap noot mies', m1, '2001-01-01'),
        ('noot mies wim zus', m2, '2001-02-01'),
        ('mies bla bla bla wim zus jet', m2, '2002-01-01'),
        ('noot mies wim zus', m2, '2001-02-03'),
    )
    a, b, c, d = [
        amcattest.create_test_article(text=text, medium=medium, date=date, create=False)
        for text, medium, date in specs
    ]
    e = amcattest.create_test_article(text='aap noot mies', medium=m3, articleset=s2)

    Article.create_articles([a, b, c, d], articleset=s1, check_duplicate=False, create_id=True)
    ES().flush()
    return m1, m2, m3, s1, s2, a, b, c, d, e
def test_not(self):
    """Check NOT queries, including a double negation."""
    aset = amcattest.create_test_set()
    eve = amcattest.create_test_article(text="eve", articleset=aset)
    paul = amcattest.create_test_article(text="paul", articleset=aset)
    adam = amcattest.create_test_article(text="adam", articleset=aset)
    ES().flush()

    def hits(query):
        # ids matching the query string within the test set
        return set(ES().query_ids(query, filters={"sets": aset.id}))

    everyone_else = {paul.id, adam.id}
    self.assertEqual({eve.id}, hits("eve"))
    self.assertEqual(everyone_else, hits("NOT eve"))
    self.assertEqual(everyone_else, hits("* NOT eve"))
    self.assertEqual({eve.id}, hits("NOT (NOT eve)"))
def test_scores(self):
    """test if scores (and matches) are as expected for various queries"""
    s = amcattest.create_test_set(articles=[
        amcattest.create_test_article(headline="a", text='dit is een test'),
    ])
    s.refresh_index()

    def q(query):
        # map headline -> score for every hit within the test set
        result = ES().query(query, filters={'sets': s.id}, fields=["headline"])
        return {a.headline: a.score for a in result}

    self.assertEqual(q("test"), {"a": 1})

    m1, m2 = [amcattest.create_test_medium() for _ in range(2)]
    a = amcattest.create_test_article(text='aap noot mies', medium=m1)
    b = amcattest.create_test_article(text='noot mies wim zus', medium=m2)
    c = amcattest.create_test_article(text='mies bla bla bla wim zus jet', medium=m2)
    d = amcattest.create_test_article(
        text='ik woon in een sociale huurwoning, net als anderen', medium=m2)
    ES().add_articles([a.id, b.id, c.id, d.id])
    ES().flush()

    def ids_in_m2(query):
        # ids matching the query string, restricted to medium m2
        return set(ES().query_ids(query, filters=dict(mediumid=m2.id)))

    self.assertEqual(set(ES().query_ids("no*")), {a.id, b.id})
    self.assertEqual(ids_in_m2("no*"), {b.id})
    self.assertEqual(ids_in_m2("zus AND jet"), {c.id})
    self.assertEqual(ids_in_m2("zus OR jet"), {b.id, c.id})
    self.assertEqual(ids_in_m2('"mies wim"'), {b.id})
    self.assertEqual(ids_in_m2('"mies wim"~5'), {b.id, c.id})
    self.assertEqual(ids_in_m2('"sociale huur*"'), {d.id})
def test_fuzzy(self):
    """Check ``do_test`` results for several ``headline_ratio`` values."""
    s = amcattest.create_test_set()
    m = amcattest.create_test_medium()

    # ids are assigned 1..4 in this order
    headlines = [
        "Dit is een test",
        "Dit is ook een test",
        "Dit is ook een tesdt",
        "Is dit een test?",
    ]
    arts = [
        amcattest.create_test_article(id=number, articleset=s, medium=m, headline=headline)
        for number, headline in enumerate(headlines, start=1)
    ]

    self.assertEqual(self.do_test(arts, ignore_medium=True), {1, 2, 3, 4})
    self.assertEqual(self.do_test(arts, ignore_medium=True, headline_ratio=90), {1, 2, 4})
    self.assertEqual(self.do_test(arts, ignore_medium=True, headline_ratio=80), {1, 4})
    self.assertEqual(self.do_test(arts, ignore_medium=True, headline_ratio=50), {1})
def setup(self):
    """Build three media, two sets and five articles.

    Returns ``(m1, m2, m3, s1, s2, a, b, c, d, e)``. a-d are built with
    ``create=False`` and stored into s1 via ``Article.create_articles``;
    e is created directly in s2.
    """
    m1 = amcattest.create_test_medium(name="De Nep-Krant")
    m2 = amcattest.create_test_medium()
    m3 = amcattest.create_test_medium()
    s1 = amcattest.create_test_set()
    s2 = amcattest.create_test_set()

    def unsaved(text, medium, date):
        return amcattest.create_test_article(text=text, medium=medium, date=date, create=False)

    a = unsaved('aap noot mies', m1, '2001-01-01')
    b = unsaved('noot mies wim zus', m2, '2001-02-01')
    c = unsaved('mies bla bla bla wim zus jet', m2, '2002-01-01')
    d = unsaved('noot mies wim zus', m2, '2001-02-03')
    e = amcattest.create_test_article(text='aap noot mies', medium=m3, articleset=s2)

    Article.create_articles([a, b, c, d], articleset=s1)
    ES().flush()
    return m1, m2, m3, s1, s2, a, b, c, d, e
def set_up(self):
    """Create a three-article set, three search queries and an Association.

    The Association is built from the "de" and "het" queries, filtered on
    the test set.
    """
    self.aset = amcattest.create_test_set()
    texts = ("de de het", "de", "een")
    self.a1, self.a2, self.a3 = [
        amcattest.create_test_article(text=text, articleset=self.aset) for text in texts
    ]

    self.de, self.het, self.aap = [
        SearchQuery.from_string(term) for term in ("de", "het", "aap")
    ]
    self.filters = {"sets": [self.aset.id]}

    amcates.ES().flush()
    self.ass = Association([self.de, self.het], self.filters)
def test_refresh_index(self):
    """Are added/removed articles added/removed from the index?"""
    # TODO add/remove articles adds to index automatically (does remove?)
    # so refresh isn't really used. Rewrite to add to db manually

    def indexed(aset):
        # ids the index currently reports for the given set
        return set(ES().query_ids(filters=dict(sets=aset.id)))

    s = amcattest.create_test_set()
    a = amcattest.create_test_article()

    s.add(a)
    self.assertEqual(set(), indexed(s))
    s.refresh_index()
    self.assertEqual({a.id}, indexed(s))

    # check adding of existing articles to a new set:
    s2 = amcattest.create_test_set()
    s2.add(a)
    s2.refresh_index()
    self.assertEqual({a.id}, indexed(s2))

    # check that removing of articles from a set works and does not affect
    # other sets
    s2.remove_articles([a])
    s2.refresh_index()
    self.assertEqual(set(), indexed(s2))
    self.assertEqual({a.id}, indexed(s))

    # removal only becomes visible after the refresh
    s.remove_articles([a])
    self.assertEqual({a.id}, indexed(s))
    s.refresh_index()
    self.assertEqual(set(), indexed(s))

    # test that remove from index works for larger sets
    s = amcattest.create_test_set()
    arts = [amcattest.create_test_article(medium=a.medium) for i in range(20)]
    s.add(*arts)
    s.refresh_index()
    self.assertEqual(indexed(s), {art.id for art in arts})

    s.remove_articles([arts[0]])
    s.remove_articles([arts[-1]])
    s.refresh_index()
    self.assertEqual(indexed(s), {art.id for art in arts[1:-1]})

    # test that changing an article's properties can be reindexed
    arts[1].medium = amcattest.create_test_medium()
    arts[1].save()
def test_refresh_index(self):
    """Are added/removed articles added/removed from the index?"""
    # TODO add/remove articles adds to index automatically (does remove?)
    # so refresh isn't really used. Rewrite to add to db manually

    def assert_indexed(aset, expected_ids):
        # compare the ids the index reports for the set with the expectation
        found = set(ES().query_ids(filters=dict(sets=aset.id)))
        self.assertEqual(expected_ids, found)

    s = amcattest.create_test_set()
    a = amcattest.create_test_article()

    s.add(a)
    assert_indexed(s, set())
    s.refresh_index()
    assert_indexed(s, {a.id})

    # check adding of existing articles to a new set:
    s2 = amcattest.create_test_set()
    s2.add(a)
    s2.refresh_index()
    assert_indexed(s2, {a.id})

    # check that removing of articles from a set works and does not affect
    # other sets
    s2.remove_articles([a])
    s2.refresh_index()
    assert_indexed(s2, set())
    assert_indexed(s, {a.id})

    # removal only becomes visible after the refresh
    s.remove_articles([a])
    assert_indexed(s, {a.id})
    s.refresh_index()
    assert_indexed(s, set())

    # test that remove from index works for larger sets
    s = amcattest.create_test_set()
    arts = [amcattest.create_test_article(medium=a.medium) for i in range(20)]
    s.add(*arts)
    s.refresh_index()
    assert_indexed(s, {art.id for art in arts})

    for dropped in (arts[0], arts[-1]):
        s.remove_articles([dropped])
    s.refresh_index()
    assert_indexed(s, {art.id for art in arts[1:-1]})

    # test that changing an article's properties can be reindexed
    arts[1].medium = amcattest.create_test_medium()
    arts[1].save()
def test_fuzzy(self):
    """Check ``do_test`` results for several ``headline_ratio`` values."""
    s = amcattest.create_test_set()
    m = amcattest.create_test_medium()
    adict = dict(text="text", articleset=s, medium=m)

    headlines = (
        "Dit is een test",
        "Dit is ook een test",
        "Dit is ook een tesdt",
        "Is dit een test?",
    )
    arts = [amcattest.create_test_article(headline=headline, **adict) for headline in headlines]

    # (extra do_test options, expected result); ignore_medium is always on
    expectations = [
        ({}, {1, 2, 3, 4}),
        ({"headline_ratio": 90}, {1, 2, 4}),
        ({"headline_ratio": 80}, {1, 4}),
        ({"headline_ratio": 50}, {1}),
    ]
    for options, expected in expectations:
        self.assertEqual(self.do_test(arts, ignore_medium=True, **options), expected)
def test_dedup(self):
    """Check ``do_test`` results for medium/page combinations and its options."""
    s = amcattest.create_test_set()
    m1 = amcattest.create_test_medium()
    m2 = amcattest.create_test_medium()
    adict = dict(text="text", headline="headline", articleset=s, deduplicate=False)

    # (medium, pagenr) per article; the last two are identical
    layout = [(m1, 1), (m1, 2), (m2, 1), (m2, 2), (m2, 2)]
    arts = [
        amcattest.create_test_article(medium=medium, pagenr=page, **adict)
        for medium, page in layout
    ]

    self.assertEqual(self.do_test(arts), {1, 2, 3, 4})
    self.assertEqual(self.do_test(arts, dry_run=True), {1, 2, 3, 4, 5})
    self.assertEqual(self.do_test(arts, ignore_medium=True), {1, 2})
    self.assertEqual(self.do_test(arts, ignore_page=True), {1, 3})
def test_post_id(self):
    """Posting existing articles by id should add them to ``self.aset``.

    Both ``{"id": ...}`` dicts and bare ids are posted, singly and as a list.
    """
    def ids_in_set():
        # ids the index currently reports for self.aset
        return set(amcates.ES().query_ids(filters={"sets": self.aset.id}))

    a = amcattest.create_test_article()
    self._post_articles({"id": a.id})
    self.assertEqual(ids_in_set(), {a.id})

    a2 = amcattest.create_test_article()
    self._post_articles([{"id": a.id}, {"id": a2.id}])
    self.assertEqual(ids_in_set(), {a.id, a2.id})

    # does it also work if we just post the ids?
    self.setUp_set()
    self._post_articles(a.id)
    self.assertEqual(ids_in_set(), {a.id})

    self._post_articles([a.id, a2.id])
    self.assertEqual(ids_in_set(), {a.id, a2.id})
def test_properties(self):
    """Are properties stored as flat fields and with correct mapping?"""
    props = {
        "proptest": "123 test, and another",
        "proptest2_url": "http://example.org",
        "proptest3_date": "2001-01-01",
        "proptest4_num": -1,
        "proptest5_tag": {"123 test", "and another"},
        "proptest6_id": "123 test, and another",
    }

    # none of the property fields may exist in the mapping beforehand
    already_mapped = set(props.keys()) & set(ES().get_mapping().keys())
    self.assertEqual(already_mapped, set())

    a = amcattest.create_test_article(properties=props)
    mapping = ES().get_mapping()

    expected_types = {
        "proptest": "default",
        "proptest2_url": "url",
        "proptest3_date": "date",
        "proptest4_num": "num",
        "proptest5_tag": "tag",
    }
    for field, ftype in expected_types.items():
        self.assertEqual(mapping[field], settings.ES_MAPPING_TYPES[ftype])

    src = ES().get(a.id)
    self.assertEqual(set(mapping.keys()), set(props.keys()) | ALL_FIELDS)

    # test if term vectors are correct, i.e. test analysis
    def tokens(field):
        triples = sorted(ES().get_tokens(a.id, fields=[field]))
        return [word for (_field, _position, word) in triples]

    self.assertEqual(tokens("proptest"), ["123", "test", "and", "another"])
    self.assertEqual(set(tokens("proptest5_tag")), {"123 test", "and another"})
    self.assertEqual(tokens("proptest6_id"), ["123 test, and another"])
    self.assertEqual(tokens("proptest2_url"), ["http://example.org"])
def test_list_media(self):
    """Test that list media works for more than 10 media"""
    from amcat.models import Article

    media = [amcattest.create_test_medium() for _ in range(20)]
    arts = [amcattest.create_test_article(medium=medium, create=False) for medium in media]
    first_five, remainder = arts[:5], arts[5:]

    s1 = amcattest.create_test_set()
    Article.create_articles(first_five, articleset=s1, check_duplicate=False, create_id=True)
    ES().flush()
    self.assertEqual(set(s1.get_mediums()), set(media[:5]))

    # second set in the same project gets the other fifteen articles
    s2 = amcattest.create_test_set(project=s1.project)
    Article.create_articles(remainder, articleset=s2, check_duplicate=False, create_id=True)
    ES().flush()
    self.assertEqual(set(s2.get_mediums()), set(media[5:]))

    self.assertEqual(set(s1.project.get_mediums()), set(media))
def test_dates(self):
    """Test whether date deserialization works, see #66"""
    samples = (
        '2001-01-01',
        '1992-12-31T23:59',
        '2012-02-29T12:34:56.789',
        datetime.datetime.now(),
    )
    for d in samples:
        a = amcattest.create_test_article(date=d)
        amcates.ES().flush()

        res = self.get("/api/v4/search", ids=a.id)
        returned = toolkit.readDate(res['results'][0]['date'])
        expected = toolkit.readDate(str(d))
        self.assertEqual(returned, expected)
def test_delete(self):
    """Deleting a set removes it and its exclusive articles, also from the index."""
    s = amcattest.create_test_set()
    sid = s.id
    s2 = amcattest.create_test_set()

    arts = [amcattest.create_test_article() for _x in range(10)]
    # arts[6] and arts[7] are shared between both sets
    s.add_articles(arts[:8])
    s2.add_articles(arts[6:])
    ES().flush()

    s.delete()
    ES().flush()

    only_in_s, shared = arts[0], arts[6]

    # articleset and articles only in that set are deleted
    self.assertRaises(ArticleSet.DoesNotExist, ArticleSet.objects.get, pk=sid)
    self.assertRaises(Article.DoesNotExist, Article.objects.get, pk=only_in_s.id)

    # shared articles are not deleted
    self.assertEqual(Article.objects.get(pk=shared.id).id, shared.id)
    self.assertEqual(set(s2.articles.values_list("pk", flat=True)),
                     {art.id for art in arts[6:]})

    # index is updated
    self.assertEqual(ES().count(filters={"sets": sid}), 0)
    self.assertEqual(ES().count(filters={"sets": s2.id}), 4)
    self.assertRaises(elasticsearch.NotFoundError, ES().get, only_in_s.id)
    self.assertEqual(ES().get(shared.id)['id'], shared.id)
def test_str(self):
    """Test unicode titles"""
    for offset in range(1, 10000, 1000):
        # ten code points starting at offset + 1, spaced 100 apart
        code_points = range(offset + 1, offset + 1000, 100)
        title = "".join(map(chr, code_points))

        article = amcattest.create_test_article(title=title)
        self.assertIsInstance(article.title, str)
        self.assertEqual(article.title, title)
def test_highlight(self):
    """Run a highlight query for "piet" over an article with a long text.

    The result of the query is not inspected; the test only exercises the
    add/query calls.
    """
    filler = "bla bla bla bla bla bla \n" * 50
    needle = "bla een piet is een piet piet bla bla bla ble"
    long_text = "".join([filler, needle, filler])
    with TestSolr() as solr:
        article = amcattest.create_test_article(text=long_text,
                                                headline='bla piet')
        solr.add_articles([article])
        solr.query_highlight("piet")
def test_articles_preprocessing_reactivate(self):
    """Are deleted analyses undeleted when they are reactivated?"""
    project = amcattest.create_test_project()
    article = amcattest.create_test_article(project=project)
    analysis = amcattest.create_test_analysis()
    AnalysisProject.objects.create(project=project, analysis=analysis)

    def compute_actions():
        # query budget: 3 for needed, 1 for existing
        with self.checkMaxQueries(n=4):
            return _get_articles_preprocessing_actions([article.id])

    def check_actions(expected_undeletions):
        additions, restarts, deletions, undeletions = compute_actions()
        self.assertEqual(multidict(additions), {})
        self.assertEqual(list(deletions), [])
        self.assertEqual(set(undeletions), expected_undeletions)
        self.assertEqual(set(restarts), set())

    # baseline: check that required=actual gives a no-op
    aa = AnalysisArticle.objects.create(article=article, analysis=analysis)
    check_actions(set())

    # now set the aa to delete and see if it is reactivated
    aa.delete = True
    aa.save()
    check_actions({aa.id})
def test_get_model_field(self):
    """get_model_field returns plain fields and follows "__" paths."""
    guardian = create_test_medium(name="The Guardian")
    article = create_test_article(text="abc", medium=guardian)

    # sanity check on the fixture itself
    self.assertEqual(article.medium.name, "The Guardian")

    related_name = get_model_field(article, "medium__name")
    self.assertEqual(related_name, "The Guardian")

    related_object = get_model_field(article, "medium")
    self.assertEqual(related_object, article.medium)

    plain_value = get_model_field(article, "text")
    self.assertEqual(plain_value, "abc")
def test_create(self):
    """Can we create a set with some articles and retrieve the articles?"""
    article_set = amcattest.create_test_set()
    expected_count = 7
    for _ in range(expected_count):
        article = amcattest.create_test_article()
        article_set.add(article)
    stored = article_set.articles.all()
    self.assertEqual(expected_count, len(stored))
def test_full_refresh(self):
    """test full refresh, e.g. document content change

    The article's medium is changed after indexing. The index is expected
    to keep the old medium after a refresh with ``full_refresh=False`` and
    to show the new medium only after ``full_refresh=True``.
    """
    old_medium, new_medium = [amcattest.create_test_medium() for _ in range(2)]
    article = amcattest.create_test_article(text='aap noot mies',
                                            medium=old_medium)
    article_set = amcattest.create_test_set()
    article_set.add(article)
    article_set.refresh_index()

    def indexed_ids(medium):
        """Ids in the set that the index lists under the given medium."""
        filters = dict(sets=article_set.id, mediumid=medium.id)
        return set(ES().query_ids(filters=filters))

    self.assertEqual(indexed_ids(old_medium), {article.id})

    article.medium = new_medium
    article.save()

    article_set.refresh_index(full_refresh=False)  # a should NOT be reindexed
    self.assertEqual(indexed_ids(old_medium), {article.id})
    self.assertEqual(indexed_ids(new_medium), set())

    article_set.refresh_index(full_refresh=True)
    self.assertEqual(indexed_ids(old_medium), set())
    self.assertEqual(indexed_ids(new_medium), {article.id})
def test_unicode(self):
    """Test unicode headlines.

    Builds ten-character headlines from code points spread over several
    ranges and checks that the created article keeps the headline
    unchanged and as a text string.
    """
    for offset in range(1, 10000, 1000):
        # chr/str are the Python 3 spellings of the removed unichr/unicode
        # builtins; the old names raise NameError on Python 3.
        s = "".join(chr(offset + c) for c in range(1, 1000, 100))
        a = amcattest.create_test_article(headline=s)
        self.assertIsInstance(a.headline, str)
        self.assertEqual(a.headline, s)
def test_scores(self):
    """test if scores (and matches) are as expected for various queries"""
    scored_article = amcattest.create_test_article(title="a",
                                                   text='dit is een test')
    s = amcattest.create_test_set(articles=[scored_article])
    s.refresh_index()

    def q(query):
        """Map title -> score for the hits of `query` within set `s`."""
        hits = ES().query(query, filters={'sets': s.id}, fields=["title"])
        return {hit.title: hit.score for hit in hits}

    self.assertEqual(q("test"), {"a": 1})

    a = amcattest.create_test_article(text='aap noot mies', title='m1')
    b = amcattest.create_test_article(text='noot mies wim zus', title='m2')
    c = amcattest.create_test_article(text='mies bla bla bla wim zus jet',
                                      title='m2')
    d = amcattest.create_test_article(
        text='ik woon in een sociale huurwoning, net als anderen',
        title='m2')
    ES().refresh()

    def m2_ids(query):
        """Ids matching `query` among the articles titled 'm2'."""
        return set(ES().query_ids(query, filters=dict(title='m2')))

    # without a title filter the wildcard also matches the 'm1' article
    self.assertEqual(set(ES().query_ids("no*")), {a.id, b.id})

    # NOTE(review): the phrase-prefix query is asserted twice; the second
    # check looks redundant but is kept as is.
    expectations = [
        ("no*", {b.id}),
        ("zus AND jet", {c.id}),
        ("zus OR jet", {b.id, c.id}),
        ('"mies wim"', {b.id}),
        ('"mies wim"~5', {b.id, c.id}),
        ('"sociale huur*"', {d.id}),
        ('"sociale huur*"', {d.id}),
    ]
    for query, expected_ids in expectations:
        self.assertEqual(m2_ids(query), expected_ids)
def test_get_object(self):
    """An article fetched via ArticleMetaResource matches the stored one."""
    # (why not test some unicode while we're at it...)
    headline = u'\xba\xa2\u0920\u0903\u0905\u0920\u0940\u1e00\u1e80\u1eb6\u1ef3'
    created = amcattest.create_test_article(headline=headline)
    fetched = self.get_object(ArticleMetaResource, created.id)
    self.assertEqual(created.headline, fetched.headline)
    self.assertEqual(created.id, fetched.id)
def test_add(self):
    """Can we create a set with some articles and retrieve the articles?

    Five of ten articles are added first; then all ten are added, which
    re-adds the first five, and the set must contain exactly the ten.
    """
    article_set = amcattest.create_test_set()
    articles = [amcattest.create_test_article() for _ in range(10)]

    first_half = articles[:5]
    article_set.add_articles(first_half)
    self.assertEqual(5, len(article_set.articles.all()))

    article_set.add_articles(articles)
    stored = set(article_set.articles.all())
    self.assertEqual(set(articles), stored)
def test_query(self):
    """Do query and query_ids work properly?"""
    article = amcattest.create_test_article(
        headline="bla", text="artikel artikel een", date="2001-01-01")
    ES().flush()

    # unpacking fails unless the query yields exactly one hit
    [hit] = ES().query("een", fields=["date", "headline"])
    self.assertEqual(hit.headline, "bla")
    self.assertEqual(hit.id, article.id)

    medium_filter = dict(mediumid=article.medium_id)
    found_ids = set(ES().query_ids(filters=medium_filter))
    self.assertEqual(found_ids, {article.id})
def test_get_model_field(self):
    """Check get_model_field for a direct field, a relation and a path."""
    medium_name = "The Guardian"
    art = create_test_article(text="abc",
                              medium=create_test_medium(name=medium_name))
    self.assertEqual(art.medium.name, medium_name)

    # "__" walks from the article to an attribute of its medium
    self.assertEqual(get_model_field(art, "medium__name"), medium_name)
    # a relation name gives the related object itself
    self.assertEqual(get_model_field(art, "medium"), art.medium)
    # a plain field name gives the field value
    self.assertEqual(get_model_field(art, "text"), "abc")
def test_get_triples_nqueries(self):
    """Does getting triples work with a single query?"""
    from amcat.models import Article

    article_id = amcattest.create_test_article().id
    with self.checkMaxQueries(1):
        reloaded = Article.objects.get(pk=article_id)
        triples = set(get_triples(reloaded))

    # just assert something silly to make sure we have data...
    self.assertGreater(len(triples), 5)
def test_create_sentences(self):
    """create_sentences stores title and text as numbered sentences.

    The expected rows are (paragraph number, sentence number, sentence):
    the title is paragraph 1 and each text paragraph follows.
    """
    title = "This is the title"
    body = "A sentence.\n\nAnother sentence. And yet a third"
    article = amcattest.create_test_article(title=title, text=body)
    create_sentences(article)

    stored = Sentence.objects.filter(article=article.id)
    rows = {(sent.parnr, sent.sentnr, sent.sentence) for sent in stored}
    expected = {
        (1, 1, title),
        (2, 1, "A sentence"),
        (3, 1, "Another sentence"),
        (3, 2, "And yet a third"),
    }
    self.assertEqual(rows, expected)
def test_permissions(self):
    """Check who may open the article details page and read the text.

    A logged-in user requests the page while the guest role of the
    article's project is varied and the set is linked to a second
    project owned by that user.
    """
    # articles should be visible if any of the sets it is in has the correct permissions
    metareader = Role.objects.get(label="metareader")
    reader = Role.objects.get(label="reader")

    owner = amcattest.create_test_user(username="******", password="******")
    guest_project = amcattest.create_test_project(name="p1")
    owned_project = amcattest.create_test_project(name="p2", owner=owner)
    article_set = amcattest.create_test_set(project=guest_project)
    article = amcattest.create_test_article(
        project=guest_project, articleset=article_set,
        text="Dit is de tekst", title="hoofdlijn")

    client = Client()
    client.login(username="******", password="******")

    view_name = "navigator:" + ArticleDetailsView.get_view_name()
    url = reverse(view_name,
                  args=[guest_project.id, article_set.id, article.id])

    def fetch(can_view=True, can_read_article=True):
        """GET the page and check the status code and text visibility."""
        response = client.get(url)
        expected_status = 200 if can_view else 403
        self.assertEqual(response.status_code, expected_status)
        if can_view:
            self.assertEqual(response.context['can_view_text'],
                             can_read_article)
        return response

    def set_guest_role(role):
        guest_project.guest_role = role
        guest_project.save()

    # fred can read it if p1 is reader
    set_guest_role(reader)
    response = fetch()
    self.assertIn(b"Dit is de tekst", response.content)

    # but not if guest role is metareader
    set_guest_role(metareader)
    response = fetch(can_read_article=False)
    self.assertNotIn(b"Dit is de tekst", response.content)
    self.assertIn(b"hoofdlijn", response.content)

    # and an error if there is no guest role at all
    set_guest_role(None)
    fetch(can_view=False)

    # Unless the article set is added to project 2 (where Fred is owner)
    owned_project.articlesets.add(article_set)
    fetch()

    # Also if project 1 has metareader as guest role
    set_guest_role(metareader)
    fetch()
def test_deduplication(self):
    """Does deduplication work as it is supposed to?"""
    props = {
        "headline": "test",
        "byline": "test",
        "date": '2001-01-01',
        "medium": amcattest.create_test_medium(),
        "project": amcattest.create_test_project(),
    }
    medium = props['medium']
    original = amcattest.create_test_article(**props)

    def q(**filters):
        """Flush the index, then return the matching ids as a set."""
        amcates.ES().flush()
        return set(amcates.ES().query_ids(filters=filters))

    def in_database(article):
        return Article.objects.filter(pk=article.id).exists()

    self.assertEqual(q(mediumid=medium), {original.id})

    # duplicate articles should not be added
    dupe = amcattest.create_test_article(check_duplicate=True, **props)
    self.assertFalse(in_database(dupe))
    self.assertEqual(dupe.duplicate_of, original.id)
    self.assertEqual(q(mediumid=medium), {original.id})

    # however, if an articleset is given the 'existing' article
    # should be added to that set
    target_set = amcattest.create_test_set()
    dupe_in_set = amcattest.create_test_article(
        check_duplicate=True, articleset=target_set, **props)
    # NOTE(review): this re-checks the first duplicate rather than
    # dupe_in_set - possibly a copy-paste slip; confirm the intent.
    self.assertFalse(in_database(dupe))
    self.assertEqual(dupe_in_set.duplicate_of, original.id)
    self.assertEqual(q(mediumid=medium), {original.id})
    self.assertEqual(set(target_set.get_article_ids()), {original.id})
    self.assertEqual(q(sets=target_set.id), {original.id})

    # can we suppress duplicate checking?
    forced = amcattest.create_test_article(check_duplicate=False, **props)
    self.assertTrue(in_database(forced))
    self.assertFalse(hasattr(forced, 'duplicate_of'))
    self.assertIn(forced.id, q(mediumid=medium))