Code example #1
File: parser.py — Project: alexyku/WikiScrape
class Parser(object):
    """A Wikipedia page parser.

    Fetches a page via the project's `Html` wrapper and extracts its
    title and tables.  Relies on `Html` instances exposing `.lines()`,
    and on line objects exposing `.has(regex)`, `.strip_tags()`, and
    `.string` — presumably the raw line text (TODO confirm against Html).
    """

    def __init__(self, url):
        """Fetch the page at *url* and capture its title.

        :param url: URL of the Wikipedia page to parse.
        """
        self.html = Html(url=url)
        self.html_lines = self.html.lines()
        self.name = self.get_name()
        # NOTE(review): never populated by this class — all_tables()
        # returns a fresh list instead of filling this dict.
        self.tables = {}

    def get_name(self):
        """Return the page title (text of the first <h1>), or None.

        Returns None explicitly when the page has no <h1> element,
        which the original left as an implicit fall-through.
        """
        for line in self.html_lines:
            if line.has(r'<h1.*?>(.*?)</h1>'):
                return line.strip_tags()
        return None

    def table_at_line(self, n):
        """Return an Html object spanning the table that opens at line *n*.

        Collects lines from the opening <table> tag through the first
        </table> line, inclusive.

        :param n: index into self.html_lines of the opening <table> tag.
        :raises IndexError: if the table is never closed before the
            end of the document (previously this crashed the same way,
            but without a useful message).
        """
        assert self.html_lines[n].has(r'<table.*?>')
        # Accumulate lines in a list and join once — avoids the
        # quadratic behavior of repeated string concatenation.
        parts = [self.html_lines[n].string]
        n += 1
        total = len(self.html_lines)
        while n < total and not self.html_lines[n].has(r'</table>'):
            parts.append(self.html_lines[n].string)
            n += 1
        if n == total:
            raise IndexError('unterminated <table>: no </table> found')
        parts.append(self.html_lines[n].string)  # include the closing tag
        return Html('\n'.join(parts))

    def name_of_table_at_line(self, n):
        """Return the nearest section heading above the table at line *n*.

        Scans upward for the closest <h*> element and returns its text
        with any "[edit]" suffix removed.  Returns None when no heading
        exists above the table.  (The original kept decrementing past 0,
        so Python's negative indexing silently wrapped around and
        scanned from the END of the page — fixed by bounding at 0.)
        """
        assert self.html_lines[n].has(r'<table.*?>')
        heading = r'<h.*?>(.*?)</h.*?>'
        while n >= 0 and not self.html_lines[n].has(heading):
            n -= 1
        if n < 0:
            return None
        return self.html_lines[n].strip_tags().replace('[edit]', '')

    def all_tables(self):
        """Return a list of Table objects for every table on the page.

        Tables under the 'Bibliography' and 'See also' headings are
        skipped.  Returns an empty list for a page with no tables.
        """
        tables = []
        skipped = ('Bibliography', 'See also')
        i = 0
        while i < len(self.html_lines):
            if self.html_lines[i].has(r'<table.*?>'):
                name = self.name_of_table_at_line(i)
                table = self.table_at_line(i)
                if name not in skipped:
                    tables.append(Table(name, table))
                # Jump past the table body.  NOTE(review): combined with
                # the unconditional i += 1 below this also skips the line
                # immediately after </table> — presumably harmless here,
                # but verify no table can start on that very next line.
                i += len(table.lines())
            i += 1
        return tables
Code example #2
File: parser.py — Project: alexyku/WikiScrape
 def __init__(self, url):
     """Fetch the Wikipedia page at *url* and record its parsed state."""
     self.tables = {}
     self.html = Html(url=url)
     self.html_lines = self.html.lines()
     self.name = self.get_name()