def test_split_body(self):
    """split_body() must find every expected article plus one 'defigured' one.

    Fix: `assertEquals` is a deprecated alias removed in Python 3.12;
    use `assertEqual`.
    """
    splitted = self.split()
    n_found = len(list(split_body(splitted[1])))
    n_sol = len(self.test_body_sols)
    # +1 accounts for the extra 'defigured' article present in the body
    self.assertEqual(n_found, n_sol + 1)
def test_parse_article(self):
    """Every parsed article dict must equal its expected solution dict.

    Fix: `assertEquals` is a deprecated alias removed in Python 3.12;
    use `assertEqual`. Commented-out debug code removed.
    """
    splitted = self.split()
    texts = split_body(splitted[1])

    arts = []
    for t in texts:
        art = parse_article(t)
        if art:
            # JSON doesn't do dates, so compare string representations
            art['date'] = str(art['date'])
            arts.append(art)

    self.assertEqual(len(arts), len(self.test_body_sols))
    for i, (found, actual) in enumerate(zip(arts, self.test_body_sols)):
        akeys = sorted(actual.keys())
        fkeys = sorted(found.keys())
        if found != actual:
            # 'debug mode': dump differing keys to make failures diagnosable
            print("Article", i, actual.get('title'))
            print("Found keys:", fkeys)
            print("Actual keys:", akeys)
            for key in sorted(set(fkeys) | set(akeys)):
                f = found.get(key)
                a = actual.get(key)
                if f != a:
                    print("i:", i, "Key:", key, " found:", repr(f),
                          " actual:", repr(a))
        self.assertEqual(fkeys, akeys)
        self.assertEqual(found, actual)
def test_parse_no_header(self):
    """A text without a header yields an empty header and exactly one article."""
    header, body = split_header(self.test_text2)
    # Remove the BOM before asserting that no header content remains.
    cleaned = header.replace(u'\ufeff', '').strip()
    self.assertFalse(bool(cleaned))
    articles = list(split_body(body))
    self.assertEqual(len(articles), 1)
def test_body_to_article(self):
    """body_to_article() must build a correct Article from parsed fields.

    Fixes: `assertEquals` is a deprecated alias removed in Python 3.12;
    use `assertEqual`. `eval()` on the metastring is replaced by
    `ast.literal_eval`, which safely parses the same literal dict repr.
    """
    import ast

    header, body = self.split()
    articles = split_body(body)
    articles = [parse_article(a) for a in articles]

    # Only testing the first article. If this contains correct
    # data, we assume the implementation is correct. However,
    # we do test the remaining articles with full_clean().
    art = body_to_article(*articles[0])
    self.assertEqual(art.length, 306)
    self.assertEqual(art.headline, "This is a headline")
    self.assertEqual(art.byline, ("with a byline. The article "
                                  "contains unicode characters."))
    self.assertEqual(art.text, articles[0][2])
    self.assertEqual(art.date, datetime.datetime(2011, 8, 31))
    self.assertEqual(art.medium.name, u"B\u00f6rsen-Zeitung")
    self.assertEqual(art.author, "MF Tokio")
    self.assertEqual(ast.literal_eval(art.metastring),
                     {u'update': u'2. September 2011',
                      u'language': u'GERMAN; DEUTSCH',
                      u'publication-type': u'Zeitung'})

    # Setup environment
    dp = amcattest.create_test_project()

    # Test remaining articles
    for art in articles[1:]:
        if art is None:
            continue
        self._create_medium(art[4])
        p = body_to_article(*art)
        p.project = dp
        p.full_clean()
def test_parse_article(self):
    """Parsed article tuples must match the expected solutions.

    Fix: `assertEquals` is a deprecated alias removed in Python 3.12;
    use `assertEqual`.
    """
    splitted = self.split()
    texts = split_body(splitted[1])

    arts = []
    for a in texts:
        art = parse_article(a)
        if art is not None:
            # JSON doesn't do dates: stringify the date field (index 3)
            art = list(art)
            art[3] = str(art[3])
            arts.append(art)

    self.assertEqual(len(arts), len(self.test_body_sols))
    for expected, found in zip(self.test_body_sols, arts):
        self.assertEqual(expected, found)
def test_body_to_article(self):
    """body_to_article() must build a correct Article from parsed fields.

    Fixes: `assertEquals` is a deprecated alias removed in Python 3.12;
    use `assertEqual`. `eval()` on the metastring is replaced by
    `ast.literal_eval`, which safely parses the same literal dict repr.
    """
    import ast

    header, body = self.split()
    articles = split_body(body)
    articles = [parse_article(a) for a in articles]

    # Only testing the first article. If this contains correct
    # data, we assume the implementation is correct. However,
    # we do test the remaining articles with full_clean().
    art = body_to_article(*articles[0])
    self.assertEqual(art.length, 306)
    self.assertEqual(art.headline, "This is a headline")
    self.assertEqual(art.byline, ("with a byline. The article "
                                  "contains unicode characters."))
    self.assertEqual(art.text, articles[0][2])
    self.assertEqual(art.date, datetime.datetime(2011, 8, 31))
    self.assertEqual(art.medium.name, u"B\u00f6rsen-Zeitung")
    self.assertEqual(art.author, "MF Tokio")
    self.assertEqual(ast.literal_eval(art.metastring),
                     {u'update': u'2. September 2011',
                      u'language': u'GERMAN; DEUTSCH',
                      u'publication-type': u'Zeitung'})

    # Setup environment
    dp = amcattest.create_test_project()

    # Test remaining articles
    for art in articles[1:]:
        if art is None:
            continue
        self._create_medium(art[4])
        p = body_to_article(*art)
        p.project = dp
        p.full_clean()
def test_kop_as_headline(self):
    """Parsing must accept "KOP: " where other files have "HEADLINE: "."""
    # Some Lexis Nexis files contain "KOP: " instead of "HEADLINE: "
    _, body = split_header(self.test_text3)
    first_text = next(split_body(body))
    parsed = parse_article(first_text)
    self.assertEqual("Gretta Duisenberg oprichtster van Palestina-groep",
                     parsed['title'])
def test_meta(self):
    """The meta dict of the first article should report a length of 306."""
    first_text = next(iter(split_body(self.split()[1])))
    # parse_article returns the meta mapping as its last element
    meta = parse_article(first_text)[-1]
    self.assertEqual(meta.pop('length').split()[0], "306")
def test_kop_as_headline(self):
    """Articles using "KOP: " instead of "HEADLINE: " still get a headline."""
    # Some Lexis Nexis files contain "KOP: " instead of "HEADLINE: "
    _, body = split_header(self.test_text3)
    fields = parse_article(next(split_body(body)))
    article = body_to_article(*fields)
    self.assertEqual("Gretta Duisenberg oprichtster van Palestina-groep",
                     article.headline)