Exemple #1
0
def string_blocks_to_events(string_blocks, p = None):
	"""Given a set of string blocks (as produced by html_to_string_blocks,
	expects that all strings are non-empty), returns a list of timeline
	events. A timeline event is
	{date: number, date_length: ..., date_string: string, content: string}

	string_blocks: iterable of dicts. Each has a 'heading' entry (a sequence
		of heading strings; the first one names the section) and a 'lines'
		entry (a sequence of dicts with 'line_type' and 'line').
	p: optional dict of parameters. It is passed through param_defaults, and
		the keys 'single_section', 'extra_ignore_sections' (an '&'-separated
		string) and 'continuations' are read here. It is also forwarded to
		_table_to_events.

	Events are emitted through _close_event(events, curr_event), which is
	also called while curr_event is None (presumably a no-op in that case;
	confirm against _close_event).
	"""

	# work on a copy so the module-level set of ignored sections is not mutated
	curr_ignore_sections = _ignore_sections.copy()
	p = param_defaults(p or {})

	def section_test(name):
		# True when the section called `name` should be processed
		normalized = name.strip().lower()
		if p['single_section']:
			return normalized == p['single_section'].strip().lower()
		return normalized not in curr_ignore_sections

	if all(not section_test(sb['heading'][0]) for sb in string_blocks):
		# allow the first section to be processed if it is the only section,
		# excluding excluded sections like see also, etc. Usually this section
		# is just an intro paragraph, but if this if statement is true, it is
		# probably the entire content of the article
		try:
			curr_ignore_sections.remove('')
		except KeyError:
			pass
	# NOTE(review): the extra sections are added after the check above, so
	# they do not influence whether the intro section gets re-enabled
	if p['extra_ignore_sections']:
		for s in p['extra_ignore_sections'].split('&'):
			curr_ignore_sections.add(s.lower().strip())

	curr_event = None
	events = []

	for string_block in string_blocks:
		# the year carried over to year-less dates never crosses a block
		prev_date = None
		if not section_test(string_block['heading'][0]):
			continue

		# create base date based on headings:
		# possible perf improvement by caching results for headings across string_blocks
		base_date = TimelineDate(TimePoint())
		base_date_string = ''
		for h in string_block['heading']:
			parse = parse_date_html(h)
			if parse:
				base_date = TimelineDate.combine(base_date, parse[0])
				base_date_string = parse[1]

		# if there's a year specified in the headings, we create a fuzzy
		# range that child elements of those headings need to fall in
		base_date_range = None
		if base_date.start_year() is not None:
			delta_minus = 10
			delta_plus = 20
			# a year ending in k zeros widens the range to 10**k before and
			# twice that after (e.g. a heading of 1900 gives -100/+200)
			# NOTE(review): this reads base_date.start.year directly while the
			# rest of the function uses start_year(); confirm they agree
			m = re.search(r'0+$', str(base_date.start.year))
			if m:
				delta_minus = int('1' + ('0' * (m.end() - m.start())))
				delta_plus = delta_minus * 2
			base_date_range = (base_date.start_year() - delta_minus, base_date.start_year() + delta_plus)

		for line in string_block['lines']:
			if line['line_type'] == LineTypes.line:
				parse = parse_date_html(line['line'])
				# if we can parse a date, create a new event. The parsed date
				# is only accepted when it is compatible with the heading date
				if parse and (
						(not base_date_range) or
						(parse[0].start_year() is None) or
						(base_date_string.lower().strip() == 'antiquity') or
						(base_date_range[0] <= parse[0].start_year() <= base_date_range[1]) or
						(TimelineDate.can_combine_as_day(base_date, parse[0]))
						):

					_close_event(events, curr_event)
					date = parse[0]
					if date.start_year() is None and prev_date:
						# this is the case where we have a month or
						# monthday but no year. in this case, take it from
						# the previous event
						date = TimelineDate.combine(prev_date, date)
					date = TimelineDate.combine(base_date, date)
					curr_event = {
						'date': date.start_year(),
						'date_length': date.length(),
						'date_string': parse[1],
						'content': parse[2]
					}
					prev_date = date
				# if we can't parse a date, append the line to the
				# current event if there is one
				elif curr_event:
					if p['continuations']:
						curr_event['content'] += _line_break + line['line']
					else:
						# continuations disabled: emit the current event and
						# start a new one that reuses its date fields
						_close_event(events, curr_event)
						curr_event = {
							'date': curr_event['date'],
							'date_length': curr_event['date_length'],
							'date_string': curr_event['date_string'],
							'content': line['line']
						}
				# if there's no parse and no current event, see if we can
				# use the base_date
				elif base_date.start_year() is not None:
					# no need to close events because curr_event is None
					curr_event = {
						'date': base_date.start_year(),
						'date_length': base_date.length(),
						'date_string': base_date_string,
						'content': line['line']
					}
			elif line['line_type'] == LineTypes.table:
				# a table ends the pending event and contributes its own events
				_close_event(events, curr_event)
				events += _table_to_events(line['line'], base_date, p)
				curr_event = None
		# an event never spans two string blocks
		_close_event(events, curr_event)
		curr_event = None

	return events