def test_parse(self):
    # Split the first fixture and check meta data, title, section and body
    # of the first two elements, then a mail link in the last element's body.
    elems = split_html(self.test1_html)

    # --- first element ---
    self.assertEqual(
        get_meta(elems[0]),
        ("Der Standard", datetime.datetime(2013, 4, 2), 1),
    )
    self.assertEqual(
        get_title(elems[0]),
        u'SP und VP k\xf6nnten dritte Partei f\xfcr Koalition brauchen',
    )
    self.assertEqual(get_section(elems[0]), u'SEITE 1')
    body = get_body(elems[0])
    self.assertTrue(body.startswith(u'Wien - SP\xd6 und \xd6VP'))
    self.assertTrue(body.endswith("hoffen. (red) Seite 7"))
    # Paragraphs are the chunks between blank lines in the body.
    self.assertEqual(len(body.split("\n\n")), 3)

    # --- second element ---
    self.assertEqual(
        get_meta(elems[1]),
        ("Wiener Zeitung", datetime.datetime(2013, 4, 2), 3),
    )
    self.assertEqual(get_title(elems[1]), u'Politique autrichienne als Vorbild')
    self.assertEqual(get_section(elems[1]), 'Europa@welt')
    body = get_body(elems[1])
    self.assertTrue(body.startswith(u'Frankreichs Botschafter'))
    self.assertTrue(body.endswith("Treffen im Oktober 2012. epa"))
    self.assertEqual(len(body.split("\n\n")), 28)

    # --- last element: the mail link markup must appear in the body ---
    # NOTE(review): the address inside this literal looks like a masked
    # placeholder; confirm it matches the fixture before relying on it.
    # assertIn reports both operands on failure, unlike assertTrue(x in y).
    body = get_body(elems[-1])
    self.assertIn('<a href="mailto:[email protected]">[email protected]</a>', body)
def test_parse(self):
    # Split the first fixture and check meta data, headline, section and
    # body of the first two elements, then a mail link in the last
    # element's body.
    elems = split_html(self.test1_html)

    # --- first element ---
    self.assertEqual(
        get_meta(elems[0]),
        ("Der Standard", datetime.datetime(2013, 4, 2), 1),
    )
    self.assertEqual(
        get_headline(elems[0]),
        u'SP und VP k\xf6nnten dritte Partei f\xfcr Koalition brauchen',
    )
    self.assertEqual(get_section(elems[0]), u'SEITE 1')
    body = get_body(elems[0])
    self.assertTrue(body.startswith(u'Wien - SP\xd6 und \xd6VP'))
    self.assertTrue(body.endswith("hoffen. (red) Seite 7"))
    # Paragraphs are the chunks between blank lines in the body.
    self.assertEqual(len(body.split("\n\n")), 3)

    # --- second element ---
    self.assertEqual(
        get_meta(elems[1]),
        ("Wiener Zeitung", datetime.datetime(2013, 4, 2), 3),
    )
    self.assertEqual(get_headline(elems[1]), u'Politique autrichienne als Vorbild')
    self.assertEqual(get_section(elems[1]), 'Europa@welt')
    body = get_body(elems[1])
    self.assertTrue(body.startswith(u'Frankreichs Botschafter'))
    self.assertTrue(body.endswith("Treffen im Oktober 2012. epa"))
    self.assertEqual(len(body.split("\n\n")), 28)

    # --- last element: the mail link markup must appear in the body ---
    # NOTE(review): the address inside this literal looks like a masked
    # placeholder; confirm it matches the fixture before relying on it.
    # assertIn reports both operands on failure, unlike assertTrue(x in y).
    body = get_body(elems[-1])
    self.assertIn('<a href="mailto:[email protected]">[email protected]</a>', body)
def test_articles(self):
    # Convert every element of both fixtures to an article and check the
    # title and date of the last article of the second fixture.

    # The articles of the first fixture are not inspected; the conversion
    # is still run so that any exception it raises fails this test.
    for elem in split_html(self.test1_html):
        get_article(elem)

    arts2 = [get_article(elem) for elem in split_html(self.test2_html)]
    last = arts2[-1]
    self.assertEqual(
        last.title,
        'Cafe Puls News 08:00 (08:00) - Peter Kaiser wird angelobt',
    )
    # The expected date carries a time of day (08:00) as well.
    self.assertEqual(last.date, datetime.datetime(2013, 4, 2, 8, 0))
def test_split(self):
    # Splitting the first fixture must yield exactly this many elements.
    expected_count = 21
    pieces = split_html(self.test1_html)
    self.assertEqual(len(pieces), expected_count)
def test_articles(self):
    # Convert every element of both fixtures to an article and check the
    # headline and date of the last article of the second fixture.

    # The articles of the first fixture are not inspected; the conversion
    # is still run so that any exception it raises fails this test.
    for elem in split_html(self.test1_html):
        get_article(elem)

    arts2 = [get_article(elem) for elem in split_html(self.test2_html)]
    last = arts2[-1]
    self.assertEqual(
        last.headline,
        'Cafe Puls News 08:00 (08:00) - Peter Kaiser wird angelobt',
    )
    # The expected date carries a time of day (08:00) as well.
    self.assertEqual(last.date, datetime.datetime(2013, 4, 2, 8, 0))