Esempio n. 1
0
	def test_span2(self):
		# Markup-free input: each span is expected to be the matching substring.
		splitter = HtmlSplitter('abcdefghijkl')
		# (start, end, expected); the last case ends one past the 12-character
		# text and is expected to give the same result as ending at 12.
		cases = [
			(2, 5, u'cde'),
			(5, 12, u'fghijkl'),
			(5, 13, u'fghijkl'),
		]
		for start, end, expected in cases:
			self.assertEqual(unicode(splitter.get_span(start, end)), expected)
Esempio n. 2
0
def _separate_events(events):
	new_events = []
	for e in events:
		htmlsplitter = HtmlSplitter(e['content'])
		separated = (htmlsplitter.get_span(start, end) \
			for start, end in _sentence_splitter.span_tokenize(htmlsplitter.text_string))
		for s in separated:
			# not sure whether to go for interface consistency or not having to reparse
			new_events.append({'date': e['date'], 'date_string': e['date_string'], 'content': unicode(s)})
	return new_events
Esempio n. 3
0
	def test_span1(self):
		# Spans over the self.data fixture (set up outside this method),
		# covering plain text, nested tags, and ranges crossing both.
		splitter = HtmlSplitter(self.data)

		# (start, end, expected markup)
		cases = [
			(0, 5, u'0abc4'),
			(5, 8, u'<p><b><a>14d</a></b></p>'),
			(0, 9, u'0abc4<p><b><a>14de</a></b></p>'),
			(2, 24, u'bc4<p><b><a>14defg21</a></b>30hijk37</p>42m'),
		]
		for start, end, expected in cases:
			self.assertEqual(unicode(splitter.get_span(start, end)), expected)
Esempio n. 4
0
	def test_span4(self):
		# A <p> containing a <br/>, followed by top-level text and <br/> tags.
		splitter = HtmlSplitter('<p class="blah">abc<br/>def</p>ghi<br/>jkl<br/>')

		# The zero-width entries, (9, 9) and (12, 12), presumably belong to
		# the top-level <br/> tags -- confirm against HtmlSplitter.
		observed_ranges = [entry['range'] for entry in splitter._top_level_ranges]
		self.assertEqual(
			observed_ranges,
			[(0, 6), (6, 9), (9, 9), (9, 12), (12, 12)])

		# (start, end, expected markup)
		cases = [
			(2, 5, u'<p class="blah">c<br/>de</p>'),
			(3, 4, u'<p class="blah">d</p>'),
			(5, 12, u'<p class="blah">f</p>ghi<br/>jkl'),
		]
		for start, end, expected in cases:
			self.assertEqual(unicode(splitter.get_span(start, end)), expected)
Esempio n. 5
0
def parse_date_html(html_string):
	"""Takes a string that contains html, and returns (date, date_string,
	content) as a tuple. For now, date is an int that represents the year.
	Negative numbers are B.C. and positive are A.D. years. If the html has no
	text, or there is no date that can be parsed, returns None.

	date and the length of the date text both come from parse_date_text,
	which is run on the extracted text after leading non-letter/digit
	characters are skipped. date_string is the HtmlSplitter.get_span() result
	covering the date text. content is the get_span() result for everything
	after the date and any separator characters, or '' when nothing is left.
	"""

	# preprocess to add newlines around <br />, or else get_text smushes
	# things together. find_all() hands back a list collected up front, so
	# the tree is not being traversed while the new strings are inserted.
	soup = BeautifulSoup(html_string)
	for br in soup.find_all('br'):
		br.insert_after(soup.new_string('\n'))
		br.insert_before(soup.new_string('\n'))

	html_splitter = HtmlSplitter(unicode(soup))
	s = html_splitter.text_string
	if not s:
		# nothing to parse; bail out before doing any regex work
		return None

	content_offset = 0

	# strip out all non-letter/digit characters from the beginning
	m = re.search(r'^[^\d\w]+', s)
	if m:
		content_offset += m.end()

	# get the date
	extract = parse_date_text(s[content_offset:])
	if not extract:
		return None
	date, date_index = extract
	# date_index is relative to the slice handed to parse_date_text, so it is
	# shifted by content_offset to address the full text
	date_string = html_splitter.get_span(content_offset, date_index + content_offset)

	content_offset += date_index

	# strip out any transition characters between the date and the content
	m = re.search(u'^[\s\-–—:\.]+', s[content_offset:])
	if m:
		content_offset += m.end()

	if content_offset >= len(s):
		# the date (plus separators) consumed the whole text
		content = ''
	else:
		content = html_splitter.get_span(content_offset, len(s))

	return (date, date_string, content)
Esempio n. 6
0
	def test_ranges(self):
		# Checks the top-level ranges computed for self.data (fixture set up
		# outside this method) and which of them _get_applicable_ranges
		# selects for a few (start, end) queries.
		splitter = HtmlSplitter(self.data)
		top_level = splitter._top_level_ranges
		self.assertEqual(
			[entry['range'] for entry in top_level],
			[(0, 5), (5, 21), (21, 29)])

		# ((start, end) query, expected 'range' values of the selected entries)
		lookups = [
			((0, 5), [(0, 5)]),
			((22, 29), [(21, 29)]),
			((3, 7), [(0, 5), (5, 21)]),
		]
		for (start, end), expected in lookups:
			applicable = splitter._get_applicable_ranges(top_level, start, end)
			self.assertEqual(
				[entry['range'] for entry in applicable],
				expected)
Esempio n. 7
0
	def test_empty(self):
		# Empty input, an empty element, and an empty element between two
		# non-empty ones. Each scenario is (markup, [((start, end), expected)]);
		# results are compared directly, without a unicode() conversion.
		scenarios = [
			('', [
				((0, 0), u''),
				((0, 5), u''),
				((-5, 5), u''),
			]),
			('<p></p>', [
				((0, 0), u''),
				((0, 5), u'<p></p>'),
				((-5, 5), u'<p></p>'),
			]),
			('<p>hello</p><p></p><p>there</p>', [
				((0, 0), u''),
				((0, 5), u'<p>hello</p>'),
				((-5, 7), u'<p>hello</p><p></p><p>th</p>'),
			]),
		]
		for markup, cases in scenarios:
			splitter = HtmlSplitter(markup)
			for (start, end), expected in cases:
				self.assertEqual(splitter.get_span(start, end), expected)
Esempio n. 8
0
	def test_out_of_range(self):
		# An end far past the text and a negative start are both expected to
		# yield the markup for the part of the range that lies within the text.
		splitter = HtmlSplitter('<p>blah</p>')

		# (start, end, expected markup)
		cases = [
			(0, 200, u'<p>blah</p>'),
			(-5, 3, u'<p>bla</p>'),
		]
		for start, end, expected in cases:
			self.assertEqual(splitter.get_span(start, end), expected)
Esempio n. 9
0
	def test_span3(self):
		# Text wrapped in a single <p>: every span is expected to come back
		# wrapped in the same tag, attributes included.
		splitter = HtmlSplitter('<p class="blah">abcdefghijkl</p>')
		# (start, end, expected markup)
		cases = [
			(2, 5, u'<p class="blah">cde</p>'),
			(5, 12, u'<p class="blah">fghijkl</p>'),
		]
		for start, end, expected in cases:
			self.assertEqual(unicode(splitter.get_span(start, end)), expected)