class Parser(object):
    """
    A Wikipedia parser.
    Captures a parsed Wikipedia page.

    The page is held as an ``Html`` object plus its list of line objects;
    each line object is expected to offer ``has(regex)``, ``strip_tags()``
    and a ``string`` attribute, which is all this class uses.
    """

    # Headings whose tables all_tables() leaves out of its result.
    _SKIPPED_TABLE_NAMES = ('Bibliography', 'See also')

    def __init__(self, url):
        """
        Builds the parser for the page at url.
        """
        self.html = Html(url=url)
        self.html_lines = self.html.lines()
        self.name = self.get_name()
        # Starts empty; nothing in this class fills it.
        self.tables = {}

    def get_name(self):
        """
        Returns the name of the Wikipedia page.

        The name is the tag-stripped text of the first line containing an
        <h1> element, or None when no line has one.
        """
        for line in self.html_lines:
            if line.has(r'<h1.*?>(.*?)</h1>'):
                return line.strip_tags()
        return None

    def _table_end(self, n):
        """
        Returns the index of the line closing the table that opens at line n.

        The opening line itself is checked, so a table written on a single
        line ends where it starts. If no closing tag follows, the last line
        of the page is returned instead of running past the end.
        """
        last = len(self.html_lines) - 1
        end = n
        while end < last and not self.html_lines[end].has(r'</table>'):
            end += 1
        return end

    def table_at_line(self, n):
        """
        Returns the table at line n.

        The result is an Html built from the lines between the opening tag
        at n and the first following line that has a closing tag (both
        included), joined with newlines.
        """
        assert self.html_lines[n].has(r'<table.*?>')
        end = self._table_end(n)
        return Html('\n'.join(
            line.string for line in self.html_lines[n:end + 1]))

    def name_of_table_at_line(self, n):
        """
        Returns the title of the table at line n.

        The title is the text of the nearest heading at or above line n,
        with tags and the '[edit]' marker removed. Returns '' when there is
        no heading above the table.
        """
        assert self.html_lines[n].has(r'<table.*?>')
        # Stop at line 0: a negative index would wrap around and pick up a
        # heading from the end of the page.
        while n >= 0:
            line = self.html_lines[n]
            if line.has(r'<h.*?>(.*?)</h.*?>'):
                return line.strip_tags().replace('[edit]', '')
            n -= 1
        return ''

    def all_tables(self):
        """
        Returns a list with a Table for every table found on the page.

        Tables whose title is in _SKIPPED_TABLE_NAMES are left out.
        """
        tables, i = [], 0
        while i < len(self.html_lines):
            if not self.html_lines[i].has(r'<table.*?>'):
                i += 1
                continue
            name = self.name_of_table_at_line(i)
            if name not in self._SKIPPED_TABLE_NAMES:
                tables.append(Table(name, self.table_at_line(i)))
            # Resume on the line right after the closing tag so that a
            # table starting there is not skipped.
            i = self._table_end(i) + 1
        return tables
def __init__(self, url):
    """
    Sets up the page state on self for the page at url.

    Stores the Html built from url, its lines, the page name reported by
    self.get_name(), and an empty tables dict.
    """
    # NOTE(review): this sits at module level although it takes self, and
    # it mirrors Parser.__init__ -- confirm whether this copy is intended.
    page = Html(url=url)
    self.html = page
    self.html_lines = page.lines()
    # get_name() runs only after html_lines has been assigned.
    self.name = self.get_name()
    self.tables = dict()