def test_get_articles(self):
    """Check get_articles() error cases, the unsplit copy, and real splits."""
    from amcat.models import Sentence

    def _get_articles(art, sents):
        # get_articles() is consumed through list() so any error it raises
        # lazily surfaces inside assertRaises.
        return list(get_articles(art, sents))

    # Should raise exception if sentences not in article
    article, sentences = self.create_test_sentences()
    s1 = Sentence.objects.filter(id=amcattest.create_test_sentence().id)
    self.assertRaises(ValueError, _get_articles, article, s1)

    # Should raise an exception if we try to split on headline
    self.assertRaises(ValueError, _get_articles, article, sentences.filter(parnr=1))

    # Should return a "copy", with byline in "text" property
    arts = _get_articles(article, Sentence.objects.none())
    # Explicit loop: map() is lazy on Python 3, so using it for its side
    # effect would never actually call save().
    for art in arts:
        art.save()

    self.assertEqual(len(arts), 1)
    sbd.create_sentences(arts[0])

    self.assertEqual(
        [s.sentence for s in sentences[1:]],
        [s.sentence for s in arts[0].sentences.all()[1:]],
    )
    self.assertIn("foo", arts[0].text)

    # Should be able to split on byline
    self.assertEqual(2, len(_get_articles(article, sentences[1:2])))
    a, b = _get_articles(article, sentences[4:5])

    # Check if text on splitted articles contains expected
    self.assertNotIn("Einde", a.text)
    self.assertIn("Einde", b.text)
def run(self, _input=None):
    """Create sentences for the articles in the configured sets that lack them.

    Reads the sets from ``self.options['articlesets']``. Articles matching
    ``sentences__id__gte=0`` are treated as already split and skipped; for
    the rest ``sbd.create_sentences`` is called. ``_input`` is not used.
    """
    sets = self.options['articlesets']
    log.info("Listing articles from sets {sets}".format(sets=sets))

    # Determine which articles are already splitted, and which are not
    in_sets = Article.objects.filter(articlesets_set__in=sets)
    all_ids = in_sets.values_list("id", flat=True)
    done_ids = in_sets.filter(sentences__id__gte=0).values_list("id", flat=True)
    pending_ids = set(all_ids) - set(done_ids)

    # Get articles to be split and precache headline, byline, text
    to_split = Article.objects.filter(id__in=pending_ids).only("headline", "byline", "text")
    n = len(to_split)

    # len(all_ids) is reported as-is (not the size of the deduplicated set).
    log.info("Total articles: {m}. To be split: {n}.".format(m=len(all_ids), n=n))

    for i, article in enumerate(to_split):
        if i % 100 == 0:
            log.info("Processing article {i}/{n}".format(i=i, n=n))
        sbd.create_sentences(article)

    log.info("Splitted {n} articles!".format(n=n))
def test_get_articles(self):
    """Check get_articles() error cases, the unsplit copy, and real splits."""
    from amcat.models import Sentence

    def _get_articles(art, sents):
        # Materialise the result so errors raised during iteration are
        # triggered inside assertRaises.
        return list(get_articles(art, sents))

    # Should raise exception if sentences not in article
    article, sentences = self.create_test_sentences()
    s1 = Sentence.objects.filter(id=amcattest.create_test_sentence().id)
    self.assertRaises(ValueError, _get_articles, article, s1)

    # Should raise an exception if we try to split on headline
    self.assertRaises(ValueError, _get_articles, article, sentences.filter(parnr=1))

    # Should return a "copy", with byline in "text" property
    arts = _get_articles(article, Sentence.objects.none())
    # A plain loop instead of map(): on Python 3 map() returns a lazy
    # iterator, so the save() calls would silently never happen.
    for art in arts:
        art.save()

    self.assertEqual(len(arts), 1)
    sbd.create_sentences(arts[0])

    self.assertEqual(
        [s.sentence for s in sentences[1:]],
        [s.sentence for s in arts[0].sentences.all()[1:]],
    )
    self.assertIn("foo", arts[0].text)

    # Should be able to split on byline
    self.assertEqual(2, len(_get_articles(article, sentences[1:2])))
    a, b = _get_articles(article, sentences[4:5])

    # Check if text on splitted articles contains expected
    self.assertNotIn("Einde", a.text)
    self.assertIn("Einde", b.text)
def test_create_sentences(self):
    """create_sentences() should store the title plus one row per sentence."""
    headline = "This is the title"
    body = "A sentence.\n\nAnother sentence. And yet a third"
    article = amcattest.create_test_article(title=headline, text=body)

    create_sentences(article)

    # Compare as (paragraph nr, sentence nr, text) triples, order-independent.
    stored = {
        (row.parnr, row.sentnr, row.sentence)
        for row in Sentence.objects.filter(article=article.id)
    }
    expected = {
        (1, 1, headline),
        (2, 1, "A sentence"),
        (3, 1, "Another sentence"),
        (3, 2, "And yet a third"),
    }
    self.assertEqual(stored, expected)
def test_create_sentences(self):
    """Verify the (parnr, sentnr, sentence) rows produced by create_sentences()."""
    title = "This is the title"
    article_text = "A sentence.\n\nAnother sentence. And yet a third"
    test_article = amcattest.create_test_article(title=title, text=article_text)
    create_sentences(test_article)

    found = set()
    for sentence in Sentence.objects.filter(article=test_article.id):
        found.add((sentence.parnr, sentence.sentnr, sentence.sentence))

    # The title is expected as paragraph 1; the text follows from paragraph 2.
    self.assertEqual(
        found,
        {
            (1, 1, title),
            (2, 1, "A sentence"),
            (3, 1, "Another sentence"),
            (3, 2, "And yet a third"),
        },
    )
def handle_split(form, project, article, sentences):
    """Split ``article`` on ``sentences`` and apply the set options in ``form``.

    The articles yielded by ``get_articles(article, sentences)`` are saved and
    sentence-split, then added to / removed from article sets according to the
    form's cleaned data.

    :param form: bound form providing ``is_valid()``, ``errors`` and
        ``cleaned_data`` with the set-handling options read below.
    :param project: project whose article sets are considered.
    :param article: the original article being split.
    :param sentences: sentences passed on to ``get_articles``.
    :returns: ``locals()``, used as context variables for the template.
    :raises ValueError: if the form is invalid (raised before anything is
        saved), or whatever ``get_articles`` raises.
    """
    # Validate first: previously the new articles were saved and split before
    # this check, so an invalid form still left them behind in the database.
    if not form.is_valid():
        raise ValueError("Form invalid: {form.errors}".format(form=form))

    articles = list(get_articles(article, sentences))

    # We won't use bulk_create yet, as it bypasses save() and doesn't
    # insert ids
    for art in articles:
        art.save()
        sbd.create_sentences(art)

    # Context variables for template
    # NOTE: local names below are part of the returned context (locals()),
    # so they must not be renamed.
    form_data = form.cleaned_data
    all_sets = list(project.all_articlesets().filter(articles=article))

    # Add splitted articles to existing sets
    for aset in form_data["add_splitted_to_sets"]:
        aset.add_articles(articles)

    # Add splitted articles to sets wherin the original article live{d,s}
    if form_data["add_splitted_to_all"]:
        asets = project.all_articlesets().filter(articles=article).only("id")
        for aset in asets:
            aset.add_articles(articles)

    if form_data["remove_from_sets"]:
        for aset in form_data["remove_from_sets"]:
            aset.remove_articles([article])

    if form_data["remove_from_all_sets"]:
        for aset in ArticleSet.objects.filter(project=project, articles=article).distinct():
            aset.remove_articles([article])

    if form_data["add_splitted_to_new_set"]:
        new_splitted_set = ArticleSet.create_set(
            project, form_data["add_splitted_to_new_set"], articles)

    if form_data["add_to_sets"]:
        for articleset in form_data["add_to_sets"]:
            articleset.add_articles([article])

    if form_data["add_to_new_set"]:
        new_set = ArticleSet.create_set(project, form_data["add_to_new_set"], [article])

    return locals()
def handle_split(form, project, article, sentences):
    """Split ``article`` on ``sentences`` and apply the set options in ``form``.

    The articles yielded by ``get_articles(article, sentences)`` are saved and
    sentence-split, then added to / removed from article sets according to the
    form's cleaned data.

    :param form: bound form providing ``is_valid()``, ``errors`` and
        ``cleaned_data`` with the set-handling options read below.
    :param project: project whose article sets are considered.
    :param article: the original article being split.
    :param sentences: sentences passed on to ``get_articles``.
    :returns: ``locals()``, used as context variables for the template.
    :raises ValueError: if the form is invalid (raised before anything is
        saved), or whatever ``get_articles`` raises.
    """
    # Reject an invalid form up front; checking it only after saving the
    # split articles (as before) left those articles persisted on failure.
    if not form.is_valid():
        raise ValueError("Form invalid: {form.errors}".format(form=form))

    articles = list(get_articles(article, sentences))

    # We won't use bulk_create yet, as it bypasses save() and doesn't
    # insert ids
    for art in articles:
        art.save()
        sbd.create_sentences(art)

    # Context variables for template
    # NOTE: this function returns locals(); keep the local names stable.
    form_data = form.cleaned_data
    all_sets = list(project.all_articlesets().filter(articles=article))

    # Add splitted articles to existing sets
    for aset in form_data["add_splitted_to_sets"]:
        aset.add_articles(articles)

    # Add splitted articles to sets wherin the original article live{d,s}
    if form_data["add_splitted_to_all"]:
        asets = project.all_articlesets().filter(articles=article).only("id")
        for aset in asets:
            aset.add_articles(articles)

    if form_data["remove_from_sets"]:
        for aset in form_data["remove_from_sets"]:
            aset.remove_articles([article])

    if form_data["remove_from_all_sets"]:
        for aset in ArticleSet.objects.filter(project=project, articles=article).distinct():
            aset.remove_articles([article])

    if form_data["add_splitted_to_new_set"]:
        new_splitted_set = ArticleSet.create_set(project, form_data["add_splitted_to_new_set"], articles)

    if form_data["add_to_sets"]:
        for articleset in form_data["add_to_sets"]:
            articleset.add_articles([article])

    if form_data["add_to_new_set"]:
        new_set = ArticleSet.create_set(project, form_data["add_to_new_set"], [article])

    return locals()
def run(self, _input=None):
    """Sentence-split every not-yet-split article of the selected article sets.

    The sets come from ``self.options['articlesets']``; ``_input`` is ignored.
    Progress is logged every 100 articles.
    """
    sets = self.options['articlesets']
    log.info("Listing articles from sets {sets}".format(sets=sets))

    # Determine which articles are already splitted, and which are not
    candidates = Article.objects.filter(articlesets_set__in=sets)
    all_ids = candidates.values_list("id", flat=True)
    splitteds_ids = candidates.filter(sentences__id__gte=0).values_list("id", flat=True)
    unsplit_ids = set(all_ids) - set(splitteds_ids)

    # Get articles to be split and precache headline, byline, text
    to_split = Article.objects.filter(id__in=unsplit_ids)
    to_split = to_split.only("headline", "byline", "text")
    n = len(to_split)

    summary = "Total articles: {m}. To be split: {n}.".format(m=len(all_ids), n=n)
    log.info(summary)

    for i, article in enumerate(to_split):
        at_progress_mark = (i % 100 == 0)
        if at_progress_mark:
            log.info("Processing article {i}/{n}".format(i=i, n=n))
        sbd.create_sentences(article)

    log.info("Splitted {n} articles!".format(n=n))
def test_handle_split(self):
    """Run handle_split() once per form option and check set membership."""
    from amcat.tools import amcattest
    from functools import partial

    article, _ = self.create_test_sentences()
    project = amcattest.create_test_project()
    aset1 = amcattest.create_test_set(4, project=project)
    aset2 = amcattest.create_test_set(5, project=project)
    aset3 = amcattest.create_test_set(0)  # created without project=project

    for _set in [aset1, aset2]:
        for _article in _set.articles.all():
            sbd.create_sentences(_article)

    a1 = aset1.articles.all()[0]

    aset1.add_articles([article])
    aset3.add_articles([a1])

    form = partial(navigator.forms.SplitArticleForm, project, article,
                   initial={"remove_from_sets": False})

    # Test form defaults (should do nothing!)
    f = form(dict())
    self.assertTrue(f.is_valid())
    handle_split(f, project, article, Sentence.objects.none())

    self.assertEqual(5, aset1.articles.all().count())
    self.assertEqual(5, aset2.articles.all().count())
    self.assertEqual(1, aset3.articles.all().count())

    self.assertTrue(self.article_in(aset1, article))
    self.assertFalse(self.article_in(aset2, article))
    self.assertFalse(self.article_in(aset3, article))

    # Passing invalid form should raise exception
    f = form(dict(add_to_sets=[-1]))
    self.assertFalse(f.is_valid())
    self.assertRaises(ValueError, handle_split, f, project, article, Sentence.objects.none())

    # Test add_to_new_set
    f = form(dict(add_to_new_set="New Set 1"))
    self.assertTrue(f.is_valid())
    handle_split(f, project, article, Sentence.objects.none())
    aset = project.all_articlesets().filter(name="New Set 1")
    self.assertTrue(aset.exists())
    self.assertEqual(project, aset[0].project)

    # Test add_to_sets
    f = form(dict(add_to_sets=[aset3.id]))
    self.assertFalse(f.is_valid())
    f = form(dict(add_to_sets=[aset2.id]))
    self.assertTrue(f.is_valid())
    handle_split(f, project, article, Sentence.objects.none())
    self.assertTrue(self.article_in(aset2, article))

    # Test add_splitted_to_new_set
    f = form(dict(add_splitted_to_new_set="New Set 2"))
    self.assertTrue(f.is_valid())
    handle_split(f, project, article, Sentence.objects.none())
    aset = project.all_articlesets().filter(name="New Set 2")
    self.assertTrue(aset.exists())
    self.assertEqual(project, aset[0].project)
    self.assertEqual(1, aset[0].articles.count())
    self.assertFalse(self.article_in(aset[0], article))

    # Test add_splitted_to_sets
    f = form(dict(add_splitted_to_sets=[aset2.id]))
    self.assertTrue(f.is_valid())
    handle_split(f, project, article, Sentence.objects.none())
    self.assertIn(article, aset2.articles.all())

    # Test remove_from_sets
    f = form(dict(remove_from_sets=[aset1.id]))
    self.assertTrue(f.is_valid())
    handle_split(f, project, article, Sentence.objects.none())
    self.assertNotIn(article, aset1.articles.all())

    # Test remove_from_all_sets
    aset1.add_articles([article])
    aset2.add_articles([article])
    aset3.add_articles([article])

    f = form(dict(remove_from_all_sets=True))
    self.assertTrue(f.is_valid())
    handle_split(f, project, article, Sentence.objects.none())

    self.assertIn(aset1, project.all_articlesets())
    self.assertIn(aset2, project.all_articlesets())
    self.assertNotIn(aset3, project.all_articlesets())

    # aset3 is not among the project's sets, so the article must stay in it.
    self.assertFalse(self.article_in(aset1, article))
    self.assertFalse(self.article_in(aset2, article))
    self.assertTrue(self.article_in(aset3, article))
def create_test_sentences(self):
    """Build a sentence-split test article.

    Returns a tuple ``(article, article.sentences.all())`` for an article
    created with byline "foo" and a short multi-paragraph text.
    """
    body = "Dit is. Tekst.\n\n"*3 + "Einde."
    test_article = amcattest.create_test_article(byline="foo", text=body)
    sbd.create_sentences(test_article)
    article_sentences = test_article.sentences.all()
    return test_article, article_sentences
def handle_split(form, project, article, sentences):
    """Split ``article`` on ``sentences`` and apply the set options in ``form``.

    The articles yielded by ``get_articles(article, sentences)`` are saved and
    sentence-split, then added to / removed from article sets according to the
    form's cleaned data. The sets collected in ``dirty_sets`` get
    ``refresh_index()`` called on them at the end.

    :param form: bound form providing ``is_valid()``, ``errors`` and
        ``cleaned_data`` with the set-handling options read below.
    :param project: project whose article sets are considered.
    :param article: the original article being split.
    :param sentences: sentences passed on to ``get_articles``.
    :returns: ``locals()``, used as context variables for the template.
    :raises ValueError: if the form is invalid (raised before anything is
        saved), or whatever ``get_articles`` raises.
    """
    # Validate before any write: the check used to run after the split
    # articles were saved, leaving them persisted when the form was invalid.
    if not form.is_valid():
        raise ValueError("Form invalid: {form.errors}".format(form=form))

    articles = list(get_articles(article, sentences))

    # We won't use bulk_create yet, as it bypasses save() and doesn't
    # insert ids
    for art in articles:
        art.save()
        sbd.create_sentences(art)

    # Context variables for template
    # NOTE: this function returns locals(); keep the local names stable.
    form_data = form.cleaned_data
    all_sets = list(project.all_articlesets().filter(articles=article))

    # Keep a list of touched sets, so we can invalidate their indices
    dirty_sets = ArticleSet.objects.none()

    # Add splitted articles to existing sets
    ArticleSet.articles.through.objects.bulk_create([
        ArticleSet.articles.through(articleset=aset, article=art)
        for art in articles
        for aset in form_data["add_splitted_to_sets"]
    ])

    # Collect changed sets
    for field in ("add_splitted_to_sets", "remove_from_sets", "add_to_sets"):
        dirty_sets |= form_data[field]

    # Add splitted articles to sets wherin the original article live{d,s}
    if form_data["add_splitted_to_all"]:
        articlesetarts = ArticleSet.articles.through.objects.filter(
            article=article, articleset__project=project)

        ArticleSet.articles.through.objects.bulk_create([
            ArticleSet.articles.through(articleset=asetart.articleset, article=art)
            for art in articles
            for asetart in articlesetarts
        ])

        dirty_sets |= project.all_articlesets().filter(articles=article).only("id")

    if form_data["remove_from_sets"]:
        for aset in form_data["remove_from_sets"]:
            aset.remove_articles([article])

    if form_data["remove_from_all_sets"]:
        for aset in ArticleSet.objects.filter(project=project, articles=article).distinct():
            aset.remove_articles([article])

    if form_data["add_splitted_to_new_set"]:
        new_splitted_set = ArticleSet.create_set(project, form_data["add_splitted_to_new_set"], articles)

    if form_data["add_to_sets"]:
        for articleset in form_data["add_to_sets"]:
            articleset.add_articles([article])

    if form_data["add_to_new_set"]:
        new_set = ArticleSet.create_set(project, form_data["add_to_new_set"], [article])

    for ds in dirty_sets:
        ds.refresh_index()

    return locals()
def create_test_sentences(self):
    """Create a test article (byline "foo"), split it, and return it with its sentences.

    The second element of the returned tuple is ``article.sentences.all()``.
    """
    paragraphs = "Dit is. Tekst.\n\n" * 3
    full_text = paragraphs + "Einde."
    created = amcattest.create_test_article(byline="foo", text=full_text)
    sbd.create_sentences(created)
    return created, created.sentences.all()
def test_handle_split(self):
    """Run handle_split() once per form option and check set membership."""
    from amcat.tools import amcattest
    from functools import partial

    article, _ = self.create_test_sentences()
    project = amcattest.create_test_project()
    aset1 = amcattest.create_test_set(4, project=project)
    aset2 = amcattest.create_test_set(5, project=project)
    aset3 = amcattest.create_test_set(0)  # created without project=project

    # Creates a codingjob for each articleset, as handle_split should account
    # for "codedarticlesets" as well.
    cj1 = amcattest.create_test_job(articleset=aset1)
    cj2 = amcattest.create_test_job(articleset=aset2)
    cj3 = amcattest.create_test_job(articleset=aset3)

    for _set in [aset1, aset2]:
        for _article in _set.articles.all():
            sbd.create_sentences(_article)

    a1 = aset1.articles.all()[0]

    aset1.add_articles([article])
    aset3.add_articles([a1])

    form = partial(navigator.forms.SplitArticleForm, project, article,
                   initial={"remove_from_sets": False})

    # Test form defaults (should do nothing!)
    f = form(dict())
    self.assertTrue(f.is_valid())
    handle_split(f, project, article, Sentence.objects.none())

    self.assertEqual(5, aset1.articles.all().count())
    self.assertEqual(5, aset2.articles.all().count())
    self.assertEqual(1, aset3.articles.all().count())

    self.assertTrue(self.article_in(cj1, aset1, article))
    self.assertFalse(self.article_in(cj2, aset2, article))
    self.assertFalse(self.article_in(cj3, aset3, article))

    # Passing invalid form should raise exception
    f = form(dict(add_to_sets=[-1]))
    self.assertFalse(f.is_valid())
    self.assertRaises(ValueError, handle_split, f, project, article, Sentence.objects.none())

    # Test add_to_new_set
    f = form(dict(add_to_new_set="New Set 1"))
    self.assertTrue(f.is_valid())
    handle_split(f, project, article, Sentence.objects.none())
    aset = project.all_articlesets().filter(name="New Set 1")
    self.assertTrue(aset.exists())
    self.assertEqual(project, aset[0].project)

    # Test add_to_sets
    f = form(dict(add_to_sets=[aset3.id]))
    self.assertFalse(f.is_valid())
    f = form(dict(add_to_sets=[aset2.id]))
    self.assertTrue(f.is_valid())
    handle_split(f, project, article, Sentence.objects.none())
    self.assertTrue(self.article_in(cj2, aset2, article))

    # Test add_splitted_to_new_set
    f = form(dict(add_splitted_to_new_set="New Set 2"))
    self.assertTrue(f.is_valid())
    handle_split(f, project, article, Sentence.objects.none())
    aset = project.all_articlesets().filter(name="New Set 2")
    self.assertTrue(aset.exists())
    self.assertEqual(project, aset[0].project)
    self.assertEqual(1, aset[0].articles.count())
    self.assertFalse(self.article_in(None, aset[0], article))

    # Test add_splitted_to_sets
    f = form(dict(add_splitted_to_sets=[aset2.id]))
    self.assertTrue(f.is_valid())
    handle_split(f, project, article, Sentence.objects.none())
    self.assertIn(article, aset2.articles.all())

    # Test remove_from_sets
    f = form(dict(remove_from_sets=[aset1.id]))
    self.assertTrue(f.is_valid())
    handle_split(f, project, article, Sentence.objects.none())
    self.assertNotIn(article, aset1.articles.all())

    # Test remove_from_all_sets
    aset1.add_articles([article])
    aset2.add_articles([article])
    aset3.add_articles([article])

    f = form(dict(remove_from_all_sets=True))
    self.assertTrue(f.is_valid())
    handle_split(f, project, article, Sentence.objects.none())

    self.assertIn(aset1, project.all_articlesets())
    self.assertIn(aset2, project.all_articlesets())
    self.assertNotIn(aset3, project.all_articlesets())

    # aset3 is not among the project's sets, so the article must stay in it.
    self.assertFalse(self.article_in(cj1, aset1, article))
    self.assertFalse(self.article_in(cj2, aset2, article))
    self.assertTrue(self.article_in(cj3, aset3, article))