def _extract_table(self, html):
    """Parse a calendar HTML page and return the events it contains.

    The second top-level ``<table>`` of the page body is used as the
    calendar. Its first row holds the hour labels; every following row is
    scanned cell by cell, and each cell containing more than one nested
    table is turned into a ``CourseEvent``.

    :param html: markup of the calendar page, passed to ``BeautifulSoup``.
    :return: list of ``CourseEvent`` objects, in the order they are found.
    """
    soup = BeautifulSoup(html)
    tables = soup.html.body.findAll(name="table", recursive=False)
    # Jump to the calendar table (second top-level table of the body).
    cal = tables[1]
    lines = cal.findAll(name="tr", recursive=False)

    # Isolate the first row, which carries the hours (first cell skipped).
    hours_line = lines[0].findChildren(name="td", recursive=False)[1:]
    # presumably one datetime per half-hour column -- confirm in the helper
    hours = insert_halfhour_slots_and_convert_to_datetime(hours_line)

    # Direct <td> children of every remaining row, looked up only once.
    rows = [line.findAll(name="td", recursive=False) for line in lines[1:]]

    # Number of table rows announced by the first cell of each row through
    # its "rowspan" attribute; 0 when the attribute is absent.
    # NOTE(review): if a row that opens a day has no "rowspan", the counter
    # below goes negative and no later row is treated as a new day.
    n_rows = [
        int(slots[0]["rowspan"]) if slots[0].has_key("rowspan") else 0
        for slots in rows
    ]

    event_list = []
    day = -1
    n = 0  # rows left to consume for the current day
    for slots, rowspan in zip(rows, n_rows):
        if not n:
            # First row of a new day. Start at -1 because the leading cell
            # (the one carrying "rowspan") is walked like any other cell and
            # advances the column index to 0.
            n = rowspan
            day += 1
            current_time = -1
        else:
            # Continuation row of the same day.
            current_time = 0
        n -= 1

        for s in slots:
            cell = s.findAll(name="table", recursive=False)
            if len(cell) <= 1:
                # No event in this cell: move on to the next column.
                current_time += 1
                continue

            # Event found.
            start_time = hours[current_time]
            # Duration is taken from the colspan (1 colspan = 1/2 hour).
            duration = int(s["colspan"])
            # Work in minutes: "duration / 2" floor-divides on Python 2 and
            # would drop the trailing half hour of odd-length events.
            delta = timedelta(minutes=30 * duration)

            td = cell[0].tr.findAll(name="td", recursive=False)
            # Gehol weeks when the event occurs.
            weeks = split_weeks(td[0].contents[0].string)
            # Location, normalised to an empty string when missing.
            location = td[1].contents[0].string or ""
            # Activity type.
            activity = cell[1].tr.td.contents[0].string

            current_time = current_time + duration

            event_data = {
                "day": day,
                "start_time": start_time,
                "duration": duration,
                "stop_time": start_time + delta,
                "weeks": weeks,
                "location": location,
                "type": activity,
                "organizer": "",
                "title": "%s - %s" % (self.metadata["mnemo"],
                                      self.metadata["title"]),
            }
            event_list.append(CourseEvent(**event_data))
    return event_list
def _load_events(self, event_table):
    """Fill ``self.events`` with the events found in ``event_table``.

    The first ``<tr>`` of the table supplies the column labels (its first
    cell is skipped); the remaining rows are grouped by day according to
    ``self._get_num_row_per_day`` and each row is handed to
    ``self._load_weekday_events``.

    :param event_table: table element whose direct ``<tr>`` children are
        the header row followed by the event rows.
    """
    table_rows = event_table.findChildren("tr", recursive=False)

    # Column labels, converted by the helper into hour objects.
    header_cells = table_rows[0].findChildren("td", recursive=False)
    hours = insert_halfhour_slots_and_convert_to_datetime(header_cells[1:])

    # Everything below the header row holds the events.
    event_rows = table_rows[1:]
    self.events = []

    first_row = 0  # index in event_rows of the current day's first row
    for num_day, _day_label, num_rows in self._get_num_row_per_day(event_rows):
        # Collect the whole day before publishing it on self.events.
        day_events = []
        for row_index in range(first_row, first_row + num_rows):
            day_events.extend(
                self._load_weekday_events(event_rows[row_index], num_day, hours)
            )
        self.events.extend(day_events)
        first_row += num_rows