Esempio n. 1
0
 def _read(self, raw_table):
     """Fill ``self.head`` and ``self.rows`` from a parsed table node.

     Each ``th`` tag in ``raw_table`` contributes its stripped text as a
     column name and is then removed from the table contents. Each ``tr``
     tag is wrapped in a ``Row``; rows reporting ``is_null`` are dropped.
     """
     contents = raw_table.contents

     # Header pass: record the text of every header cell, then detach it.
     header_cells = contents.filter_tags(matches=ftag('th'))
     for cell in header_cells:
         label = cell.contents.strip_code()
         self.head.append(label.strip(' '))
         contents.remove(cell)
     column_count = len(header_cells)
     log.debug('parsed {} columns from table {}'.format(column_count, self.name))

     # Row pass: skip the rows that flag themselves as null.
     for row_node in contents.ifilter_tags(matches=ftag('tr')):
         candidate = Row(self.head, row_node)
         if candidate.is_null:
             continue
         self.rows.append(candidate)
     log.debug('parsed {} rows from table {}'.format(len(self.rows), self.name))
Esempio n. 2
0
    def _read(self, raw_table):
        """Read the column names and the rows out of ``raw_table``.

        Header (``th``) tags are turned into column names and removed from
        the table contents; afterwards every ``tr`` tag becomes a ``Row``,
        which is stored unless its ``is_null`` attribute is true.
        """
        headers = raw_table.contents.filter_tags(matches=ftag('th'))
        for header in headers:
            text = header.contents.strip_code().strip(' ')
            self.head.append(text)
            raw_table.contents.remove(header)
        columns_message = 'parsed %d columns from table %s' % (
            len(headers), self.name)
        log.debug(columns_message)

        row_nodes = raw_table.contents.ifilter_tags(matches=ftag('tr'))
        for row_node in row_nodes:
            parsed = Row(self.head, row_node)
            if parsed.is_null:
                # Null rows are not stored.
                continue
            self.rows.append(parsed)
        rows_message = 'parsed %d rows from table %s' % (
            len(self.rows), self.name)
        log.debug(rows_message)
Esempio n. 3
0
 def __init__(self, name, raw_table):
     """Initialise the table state and parse ``raw_table`` right away.

     name: label of the table, stored after conversion with ``ustr``.
     raw_table: table node; its contents are searched for ``tr`` tags.
     """
     self.name = ustr(name)
     self.rows, self._head = [], []
     self._node = raw_table
     row_tags = raw_table.contents.filter_tags(matches=ftag('tr'))
     self._tr_nodes = row_tags
     # Parsing runs last so that every attribute above already exists.
     self._read(raw_table)
Esempio n. 4
0
    def _parse(self, node):
        """Build a ``Row`` from one table-row node.

        The ``th``/``td`` cells of ``node`` are parsed into fields and
        assigned to the columns of ``self.head`` in order. A column that is
        still covered by an earlier ``rowspan`` cell reuses the stored field
        instead of consuming a new one.

        Returns the populated ``Row``. Columns without a field and fields
        without a column are reported through ``log.warning``.
        """
        rname = '%s[%s]' % (self._tname, self._idx)
        self._idx += 1
        r = Row(rname, node)
        cols = node.contents.ifilter_tags(matches=ftag('th', 'td'))
        # Parse every cell up front, then consume the fields in order through
        # an iterator (avoids the O(n) cost of list.pop(0)).
        fields = iter([f for col in cols for f in self._freader.parse(col)])
        exhausted = object()

        for col_name in self.head:
            if self._nspan[col_name]:
                # Column still spanned by a cell from a previous row.
                r[col_name] = self._span[col_name]
                self._nspan[col_name] -= 1
                continue

            f = next(fields, exhausted)
            if f is exhausted:
                log.warning('%s: missing field for column [%s]',
                            r.name, col_name)
                continue

            if 'rowspan' in f.attrs:
                # Remember the field for the following rowspan - 1 rows.
                self._span[col_name] = f
                self._nspan[col_name] = int(f.attrs['rowspan']) - 1

            r[col_name] = f

        # Whatever is left over has no column to go to.
        for f in fields:
            log.warning('%s: dropping field from unknown column: %s',
                        r.name, f)

        return r
Esempio n. 5
0
    def _parse(self, node):
        """Convert a table-row node into a ``Row`` keyed by column name.

        Fields parsed from the ``th``/``td`` cells of ``node`` are matched
        to ``self.head`` positionally. While a column's remaining rowspan
        counter (``self._nspan``) is non-zero, the field saved in
        ``self._span`` is reused and no new field is consumed.

        Returns the ``Row``. A warning is logged for each column left
        without a field and for each field left without a column.
        """
        rname = '%s[%s]' % (self._tname, self._idx)
        self._idx += 1
        r = Row(rname, node)
        cols = node.contents.ifilter_tags(matches=ftag('th', 'td'))
        parsed = [f for col in cols for f in self._freader.parse(col)]
        # Walk the parsed fields with an iterator instead of list.pop(0),
        # which shifts the whole list on every call.
        remaining = iter(parsed)
        no_field = object()

        for col_name in self.head:
            if self._nspan[col_name]:
                # An earlier cell spans into this row for this column.
                r[col_name] = self._span[col_name]
                self._nspan[col_name] -= 1
                continue

            f = next(remaining, no_field)
            if f is no_field:
                log.warning('%s: missing field for column [%s]',
                            r.name, col_name)
                continue

            if 'rowspan' in f.attrs:
                # The cell also fills this column in the next rowspan - 1 rows.
                self._span[col_name] = f
                self._nspan[col_name] = int(f.attrs['rowspan']) - 1

            r[col_name] = f

        # Fields beyond the last header column cannot be placed.
        for f in remaining:
            log.warning('%s: dropping field from unknown column: %s',
                        r.name, f)

        return r
Esempio n. 6
0
 def __init__(self, name, raw_table):
     """Create the table and read its header and rows from ``raw_table``.

     name: label of the table, stored after conversion with ``ustr``.
     raw_table: table node; its ``tr`` tags are collected up front.
     """
     self.name = ustr(name)
     self.rows = list()
     self._head = list()
     self._node = raw_table
     contents = raw_table.contents
     self._tr_nodes = contents.filter_tags(matches=ftag('tr'))
     # The header is read before the rows.
     self._read_header()
     self._read_rows()
Esempio n. 7
0
 def _read(head, node):
     """Pair the header names with the cells of one row node.

     head: sequence of column names.
     node: row node whose ``th``/``td`` tags are the cells.

     Returns a ``zip`` of ``(column name, {"value": ..., "link": ...})``
     pairs, or ``False`` when the number of cells differs from the number
     of header names.
     """
     cols = list(node.contents.ifilter_tags(matches=ftag('th', 'td')))
     # A row whose cell count differs from the header cannot be aligned.
     if len(head) != len(cols):
         return False
     # Build each Field once and reuse it for both the value and the link.
     fields = [Field(c) for c in cols]
     return zip(head, [{"value": str(f), "link": f.link} for f in fields])
Esempio n. 8
0
def import_tables(article, lang="en"):
    """Fetch a wiki article and collect its tables, grouped by section.

    article: title passed to ``Client.fetch_page`` / ``fetch_extract``.
    lang: language code handed to ``Client`` (default ``"en"``).

    Returns a nested dict holding the page ``title``, its ``extract`` and,
    per section that produced at least one table (keyed "0", "1", ...),
    the section ``head``, the remaining section ``text`` and a ``table``
    mapping with ``rows``, ``head``, ``rows_count`` and ``cols_count`` for
    each table. Returns ``None`` when no table was extracted at all.
    """
    client = Client(lang)
    page = client.fetch_page(article)
    body = page['revisions'][0]['*']
    extract = client.fetch_extract(article)
    parsed_body = mwp.parse(body, skip_style_tags=True)
    table_extracted = False

    tables_info = nested_dict()

    tables_info['title'] = page['title']
    tables_info['extract'] = extract

    ## get sections
    sections = parsed_body.get_sections(include_lead=False,
                                        include_headings=True,
                                        flat=True)

    print(page['title'])
    section_count = 0
    for s in sections:
        t = s.filter_tags(matches=ftag('table'))
        if not t:
            continue

        section_key = str(section_count)
        section_table = False
        head = mwp.parse(s.filter_headings()[0])
        tables_info[section_key]["head"] = head.strip_code()
        s.remove(head)
        table_count = 0
        for x in t:
            name = '{}|Table {}'.format(page['title'], table_count)
            wt = WikiTable(name, x)
            # Tables whose ``flag`` is set are left out of the result.
            if not wt.flag:
                table_extracted = True
                section_table = True
                entry = tables_info[section_key]["table"][str(table_count)]
                entry["rows"] = [dict(r) for r in wt.rows]
                entry["head"] = wt.head
                entry["rows_count"] = wt.rows_len
                entry["cols_count"] = wt.head_len
                table_count += 1
            # The table may already be gone from the section (for example
            # when it was nested in a table removed earlier); removal then
            # fails with ValueError, which is safe to ignore.
            try:
                s.remove(x)
            except ValueError:
                pass
        tables_info[section_key]["text"] = s.strip_code()
        if section_table:
            section_count += 1
        else:
            # No usable table in this section: discard its entry so the
            # key can be reused by the next section.
            del tables_info[section_key]

    if table_extracted:
        return tables_info

    return None
Esempio n. 9
0
    def _find_header_row(self):
        """
        Evaluate all rows and determine header position, based on
        greatest number of 'th' tagged elements.

        The chosen row is removed from ``self._tr_nodes`` and its 'th'
        elements are returned. When there are no rows, or no row contains
        a 'th' element, an empty list is returned and ``self._tr_nodes``
        is left untouched.
        """
        th_counts = [
            len(tr.contents.filter_tags(matches=ftag('th')))
            for tr in self._tr_nodes
        ]
        th_max = max(th_counts) if th_counts else 0

        if not th_max:
            # Nothing qualifies as a header row. Popping row 0 here would
            # raise IndexError on an empty table or throw away a data row.
            return []

        # index() yields the first row reaching the maximum, matching a
        # strict "greater than" scan from the top.
        header_idx = th_counts.index(th_max)

        self._log('found header at row %d (%d <th> elements)' %
                  (header_idx, th_max))

        header_row = self._tr_nodes.pop(header_idx)
        return header_row.contents.filter_tags(matches=ftag('th'))
Esempio n. 10
0
 def __init__(self, name, raw_table):
     """Prepare the table attributes and parse ``raw_table``.

     name: label of the table, stored after conversion with ``ustr``.
     raw_table: table node; its ``tr`` tags are collected up front.
     """
     self.name = ustr(name)
     self._head, self.rows = [], []
     self._node = raw_table
     tr_matcher = ftag('tr')
     self._tr_nodes = raw_table.contents.filter_tags(matches=tr_matcher)
     # Hack: callers inspect this flag to decide whether or not to
     # return the table. It starts out unset.
     self.flag = False
     self._read(raw_table)
Esempio n. 11
0
 def _find_header_flat(self):
     """
     Look for header cells placed directly inside the table instead of
     inside a row ('<tr>'). Returns the '<th>' nodes found this way, or
     None when there are none.
     """
     flat_headers = self._node.contents.filter_tags(
         recursive=False, matches=ftag('th'))
     if flat_headers:
         count = len(flat_headers)
         self._log('found header outside rows (%d <th> elements)' % count)
         return flat_headers
     return None
Esempio n. 12
0
 def _find_header_flat(self):
     """
     Collect '<th>' elements that are direct children of the table node,
     i.e. headers that were not wrapped in a row ('<tr>'). Gives back the
     list of such elements, or None if the search comes up empty.
     """
     th_matcher = ftag('th')
     contents = self._node.contents
     found = contents.filter_tags(matches=th_matcher, recursive=False)
     if not found:
         return None
     message = 'found header outside rows (%d <th> elements)' % len(found)
     self._log(message)
     return found
Esempio n. 13
0
    def _find_header_row(self):
        """
        Pick the row holding the most 'th' tagged elements as the header.

        That row is removed from ``self._tr_nodes`` and its 'th' elements
        are returned. Returns None when no row contains a 'th' element.
        """
        th_counts = [
            len(tr.contents.filter_tags(matches=ftag('th')))
            for tr in self._tr_nodes
        ]
        th_max = max(th_counts) if th_counts else 0
        if th_max == 0:
            return None

        # First row that reaches the maximum wins.
        header_idx = th_counts.index(th_max)
        message = 'found header at row %d (%d <th> elements)' % (
            header_idx, th_max)
        self._log(message)

        header_row = self._tr_nodes.pop(header_idx)
        header_cells = header_row.contents.filter_tags(matches=ftag('th'))
        return header_cells
Esempio n. 14
0
    def _make_default_header(self):
        """
        Build placeholder column names ('column0', 'column1', ...), one per
        column of the row with the most 'td' elements
        """
        td_counts = [
            len(tr.contents.filter_tags(matches=ftag('td')))
            for tr in self._tr_nodes
        ]
        # The leading 0 keeps max() defined when there are no rows.
        td_max = max([0] + td_counts)

        self._log('creating default header (%d columns)' % td_max)
        return ['column%d' % n for n in range(td_max)]
Esempio n. 15
0
    def _make_default_header(self):
        """
        Generate generic column names sized to the widest row, where the
        width of a row is its number of 'td' elements
        """
        widest = 0
        for tr in self._tr_nodes:
            cells = tr.contents.filter_tags(matches=ftag('td'))
            widest = max(widest, len(cells))

        self._log('creating default header (%d columns)' % widest)

        names = []
        for position in range(widest):
            names.append('column%d' % position)
        return names
Esempio n. 16
0
def import_tables(article, lang="en"):
    """Fetch ``article`` and wrap each of its table tags in a ``WikiTable``.

    article: title passed to ``Client.fetch_page``.
    lang: language code handed to ``Client`` (default ``"en"``).

    Returns a list of ``WikiTable`` objects named ``<page title>[<index>]``.
    """
    page = Client(lang).fetch_page(article)
    wikitext = page['revisions'][0]['*']

    ## parse for tables
    table_tags = mwp.parse(wikitext).filter_tags(matches=ftag('table'))

    return [
        WikiTable('%s[%s]' % (page['title'], position), tag)
        for position, tag in enumerate(table_tags)
    ]
Esempio n. 17
0
 def _read(head, node):
     """Pair each name in ``head`` with a ``Field`` built from a row cell.

     The cells are the ``th`` and ``td`` tags found in ``node``.
     """
     fields = []
     for cell in node.contents.ifilter_tags(matches=ftag('th', 'td')):
         fields.append(Field(cell))
     return zip(head, fields)
Esempio n. 18
0
 def _read(head, node):
     """Pair each name in ``head`` with a ``Field`` built from a ``td`` cell.

     Only the ``td`` tags found in ``node`` are used as cells.
     """
     cell_matcher = ftag('td')
     cells = node.contents.ifilter_tags(matches=cell_matcher)
     fields = list(map(Field, cells))
     return zip(head, fields)
Esempio n. 19
0
    def parse_programs(self):
        """Read programs, strategies and goals from the operational plan.

        The wikipage is expected to contain a table laid out in a
        particular way, with cells spanning multiple rows and HTML
        comments carrying part of the information. An example of such a
        table can be found on:
        https://se.wikimedia.org/w/index.php?title=Verksamhetsplan_2019/Tabell_%C3%B6ver_program,_strategi_och_m%C3%A5l&oldid=75471.

        Results are appended to ``self._programs``; projects that end up
        without a program are reported with a warning.
        """

        page_title = self._make_year_title(
            self._config["year_pages"]["operational_plan"])
        operational_plan_page = Page(self._site, page_title)
        # Only the first table on the page is used.
        tables = mwp.parse(operational_plan_page.text).filter_tags(
            matches=ftag('table'))
        table_string = str(tables[0])
        # Strip ref tags and link markup.
        table_string = re.sub(
            r"(<ref.*?>.*?</ref>|\[\[.*?\||\]\])",
            "",
            table_string,
            flags=re.S)
        remaining_projects = list(self._projects.keys())
        # Rows are separated by "|-"; the first chunk holds the headers,
        # which are not needed.
        for row in table_string.split("|-")[1:]:
            if not row.rstrip("|}").strip():
                # Nothing but the table end marker, skip it.
                continue
            # Split the row into cells and drop the formatting in front
            # of the last pipe of each cell; empty cells are discarded.
            raw_cells = re.split(r"[\|\n]\|", row)
            trimmed = (raw.split("|")[-1].strip() for raw in raw_cells)
            cells = [cell for cell in trimmed if cell]
            if len(cells) == 3:
                # Three cells: the row starts a new program.
                program_match = re.match(
                    r"(.*)\s+<!--\s*(.*)\s*-->", cells[0])
                program_name, program_number = program_match.groups()
                self._programs.append({
                    "number": program_number,
                    "name": program_name,
                    "strategies": []
                })
            if len(cells) >= 2:
                # The strategy is always the second cell from the right.
                strategy_match = re.match(
                    r"(.*)\s*<!--\s*(\d+)\s*(.*)\s-->", cells[-2])
                strategy, strategy_number, strategy_short = \
                    strategy_match.groups()
                strategy_entry = {
                    "number": strategy_number,
                    "description": strategy,
                    "short_description": strategy_short,
                    "projects": [],
                    "goals": []
                }
                self._programs[-1]["strategies"].append(strategy_entry)
                # Attach the projects belonging to this strategy.
                for project in self._get_projects_for_strategy(
                        strategy_number):
                    strategy_entry["projects"].append(project)
                    remaining_projects.remove(project)
            # The goal is always the rightmost cell; it belongs to the
            # most recently added strategy.
            current_strategy = self._programs[-1]["strategies"][-1]
            current_strategy["goals"].append(cells[-1])
        if remaining_projects:
            unmatched = ', '.join(remaining_projects)
            logging.warning(
                "There were projects which could not be matched to programs, "
                "these will be skipped from overview pages: '{}'".format(
                    unmatched))