def string_blocks_to_events(string_blocks, p=None):
    """Convert string blocks into a list of timeline events.

    Given a set of string blocks (as produced by html_to_string_blocks,
    expects that all strings are non-empty), returns a list of timeline
    events. Each event built here is a dict of the form
        {date: number, date_length: <result of TimelineDate.length()>,
         date_string: string, content: string}
    Finished events are handed to _close_event together with the result
    list; events coming from tables are produced by _table_to_events.

    Parameters:
        string_blocks: iterable of dicts with a 'heading' list (its first
            entry is the section name) and a 'lines' list whose items
            carry 'line_type' and 'line'.
        p: optional dict of options, completed by param_defaults. The keys
            read here are 'single_section', 'extra_ignore_sections'
            ('&'-separated section names) and 'continuations'.

    Returns:
        The list of events.
    """
    curr_ignore_sections = _ignore_sections.copy()
    p = param_defaults(p or {})

    def section_test(name):
        """Return True if the section called `name` should be processed."""
        if p['single_section']:
            return name.strip().lower() == p['single_section'].strip().lower()
        return name.strip().lower() not in curr_ignore_sections

    if all(not section_test(sb['heading'][0]) for sb in string_blocks):
        # allow the first section to be processed if it is the only section,
        # excluding excluded sections like see also, etc. Usually this section
        # is just an intro paragraph, but if this if statement is true, it is
        # probably the entire content of the article
        curr_ignore_sections.discard('')
    if p['extra_ignore_sections']:
        for s in p['extra_ignore_sections'].split('&'):
            curr_ignore_sections.add(s.lower().strip())

    curr_event = None
    events = []

    for string_block in string_blocks:
        prev_date = None
        if not section_test(string_block['heading'][0]):
            continue

        # create base date based on headings:
        # possible perf improvement by caching results for headings across
        # string_blocks
        base_date = TimelineDate(TimePoint())
        base_date_string = ''
        for h in string_block['heading']:
            parse = parse_date_html(h)
            if parse:
                base_date = TimelineDate.combine(base_date, parse[0])
                base_date_string = parse[1]

        # if there's a year specified in the headings, we create a fuzzy
        # range that child elements of those headings need to fall in
        base_date_range = None
        if base_date.start_year() is not None:
            delta_minus = 10
            delta_plus = 20
            # a heading year ending in zeros widens the range: N trailing
            # zeros give delta_minus = 10**N. (Plain raw string: the old
            # ur'' prefix does not parse on Python 3.)
            m = re.search(r'0+$', str(base_date.start.year))
            if m:
                delta_minus = int('1' + ('0' * (m.end() - m.start())))
                delta_plus = delta_minus * 2
            base_date_range = (base_date.start_year() - delta_minus,
                               base_date.start_year() + delta_plus)

        for line in string_block['lines']:
            if line['line_type'] == LineTypes.line:
                parse = parse_date_html(line['line'])
                # if we can parse a date, create a new event
                if parse and (
                        (not base_date_range) or
                        (parse[0].start_year() is None) or
                        (base_date_string.lower().strip() == 'antiquity') or
                        (base_date_range[0] <= parse[0].start_year()
                            <= base_date_range[1]) or
                        TimelineDate.can_combine_as_day(base_date, parse[0])):
                    _close_event(events, curr_event)
                    date = parse[0]
                    if date.start_year() is None and prev_date:
                        # this is the case where we have a month or
                        # monthday but no year. in this case, take it from
                        # the previous event
                        date = TimelineDate.combine(prev_date, date)
                    date = TimelineDate.combine(base_date, date)
                    curr_event = {
                        'date': date.start_year(),
                        'date_length': date.length(),
                        'date_string': parse[1],
                        'content': parse[2]
                    }
                    prev_date = date
                # if we can't parse a date, append the line to the
                # current event if there is one
                elif curr_event:
                    if p['continuations']:
                        curr_event['content'] += _line_break + line['line']
                    else:
                        # start a separate event that reuses the date of
                        # the event it follows
                        _close_event(events, curr_event)
                        curr_event = {
                            'date': curr_event['date'],
                            'date_length': curr_event['date_length'],
                            'date_string': curr_event['date_string'],
                            'content': line['line']
                        }
                # if there's no parse and no current event, see if we can
                # use the base_date
                elif base_date.start_year() is not None:
                    # no need to close events because curr_event is None
                    curr_event = {
                        'date': base_date.start_year(),
                        'date_length': base_date.length(),
                        'date_string': base_date_string,
                        'content': line['line']
                    }
            elif line['line_type'] == LineTypes.table:
                _close_event(events, curr_event)
                events += _table_to_events(line['line'], base_date, p)
                curr_event = None

        # an event never carries over into the next section
        _close_event(events, curr_event)
        curr_event = None

    return events
def _table_to_events(table, base_date, p=None):
    """Given a table html element as a BeautifulSoup, returns a list of
    timeline events.

    Parameters:
        table: the table element. NOTE: the 'rowspan' attribute of its
            cells is overwritten in place while the rows are walked.
        base_date: TimelineDate that every date parsed from the table is
            combined with. A row consisting of a single cell that parses
            as a date replaces it for the rows that follow.
        p: optional dict of options, completed by param_defaults. Only
            'keep_row_together' is read here: when truthy each row yields
            one event, otherwise each line of each content cell yields
            its own event.

    Returns:
        A list of dicts with the keys 'date', 'date_length', 'date_string'
        and 'content'.
    """
    p = param_defaults(p or {})

    def get_rowspan(td):
        """Return td's rowspan as an int >= 0, or None if absent/invalid."""
        s = td.get('rowspan')
        if s is None:
            return None
        try:
            i = int(s)
        except ValueError:
            return None
        return i if i >= 0 else None

    def make_event(date, date_string, content):
        """Build one event dict from a TimelineDate and its strings."""
        return {
            'date': date.start_year(),
            'date_length': date.length(),
            'date_string': date_string,
            'content': content
        }

    events = []

    # locate the year/date columns from header cells reading 'year'/'date'
    year_col_index = None
    date_col_index = None
    for row in table.find_all('tr'):
        cells = row.find_all('th')
        for i, cell in enumerate(cells):
            cell_text = cell.get_text().strip().lower()
            if cell_text == 'year':
                year_col_index = i
            elif cell_text == 'date':
                date_col_index = i
    if date_col_index is not None and year_col_index is None:
        year_col_index = date_col_index
        date_col_index = None
    if year_col_index is None and date_col_index is None:
        # just try using the first column. could be a bit smarter about giving
        # up early to save some cycles...
        year_col_index = 0
    # from here on year_col_index is always set

    # a td that has a rowspan will be stored as (col_index, cell) The
    # rowspan number essentially gets decremented in the td element each
    # time it is added to the subsequent row
    rowspans = []
    # only used when p['keep_row_together'] is falsy: maps the inner html of
    # a rowspan cell to the (date, date_string) of the row it started on
    open_rowspans = {}
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        # first, apply existing rowspans
        for (i, cell) in rowspans:
            if get_rowspan(cell) > 0:
                cells.insert(i, cell)
        # then, recollect existing and new rowspans
        rowspans = []
        for (i, cell) in enumerate(cells):
            rs = get_rowspan(cell)
            if rs:
                cell['rowspan'] = rs - 1
                rowspans.append((i, cell))

        if not cells and len(row.find_all('th')) == 1:
            cells = row.find_all('th')

        if len(cells) == 1:
            # a lone cell holding a date becomes the new base date
            extract = parse_date_html(_bs_inner_html(cells[0]))
            if extract:
                base_date = TimelineDate.combine(base_date, extract[0])
                events.append(make_event(base_date, extract[1], extract[2]))
            continue
        if len(cells) <= year_col_index:
            continue

        extract = parse_date_html(_bs_inner_html(cells[year_col_index]))
        if not extract:
            continue
        date = extract[0]
        date_string = extract[1]
        if date_col_index is not None and len(cells) > date_col_index:
            extract2 = parse_date_html(_bs_inner_html(cells[date_col_index]))
            if extract2:
                date = TimelineDate.combine(date, extract2[0])
                date_string += ' ' + extract2[1]
        date = TimelineDate.combine(base_date, date)

        content_cells = [cell for (i, cell) in enumerate(cells)
                         if i != year_col_index and i != date_col_index]

        if p['keep_row_together']:
            content = ' '.join(_bs_inner_html(cell) for cell in content_cells)
            events.append(make_event(date, date_string, content))
            continue

        # deal with rowspan cells. By now the stored rowspan has been
        # decremented for this row, so a value of 0 marks the last row the
        # cell covers.
        for cell in content_cells:
            remaining = get_rowspan(cell)
            if remaining is None:
                continue
            key = _bs_inner_html(cell)
            closing = remaining <= 0
            if key not in open_rowspans:
                if closing:
                    # the span starts and ends on this row (for example
                    # rowspan="1"); emit it as an ordinary cell instead of
                    # registering it and never emitting its content
                    for line in _lines_from_html(cell):
                        events.append(make_event(date, date_string, line))
                else:
                    open_rowspans[key] = (date, date_string)
            elif closing:
                # pop so that a later span with identical content starts
                # from its own first row rather than from this one's
                rowspan_start = open_rowspans.pop(key)
                rowspan_date = TimelineDate.span_from_dates(
                    rowspan_start[0], date)
                rowspan_date_string = rowspan_start[1] + ' - ' + date_string
                for line in _lines_from_html(cell):
                    events.append(
                        make_event(rowspan_date, rowspan_date_string, line))

        # deal with non-rowspan cells
        for cell in content_cells:
            if get_rowspan(cell) is None:
                for line in _lines_from_html(cell):
                    events.append(make_event(date, date_string, line))

    return events
def test_one(self):
    """Print the results of three sample combine() calls.

    Nothing is asserted; the output is meant to be inspected by hand.
    """
    # TimePoint.combine, first pair of arguments
    left = TimePoint(1920, 4)
    right = TimePoint(None, 5, 3)
    combined = TimePoint.combine(left, right)
    print(combined)

    # TimePoint.combine, second pair of arguments
    left = TimePoint(1920)
    right = TimePoint(None, 4)
    combined = TimePoint.combine(left, right)
    print(combined)

    # the same kind of call one level up, on TimelineDate wrappers
    left = TimelineDate(TimePoint(1920, 8))
    right = TimelineDate(TimePoint(4))
    combined = TimelineDate.combine(left, right)
    print(combined)