def _read(self, raw_table):
    """Populate ``self.head`` and ``self.rows`` from *raw_table*.

    Header cells (<th>) are stripped of markup, recorded, and removed
    from the node tree; every remaining <tr> becomes a Row, with empty
    (null) rows discarded.
    """
    headers = raw_table.contents.filter_tags(matches=ftag('th'))
    for header in headers:
        label = header.contents.strip_code().strip(' ')
        self.head.append(label)
        # drop the header cell so it is not re-parsed as row data
        raw_table.contents.remove(header)
    log.debug('parsed {} columns from table {}'.format(len(headers), self.name))
    for tr_node in raw_table.contents.ifilter_tags(matches=ftag('tr')):
        candidate = Row(self.head, tr_node)
        if candidate.is_null:
            continue
        self.rows.append(candidate)
    log.debug('parsed {} rows from table {}'.format(len(self.rows), self.name))
def _read(self, raw_table):
    """Extract <th> header labels, then wrap each <tr> node in a Row.

    Header nodes are removed from the tree after their text is
    captured; null rows are not kept.
    """
    header_cells = raw_table.contents.filter_tags(matches=ftag('th'))
    for cell in header_cells:
        self.head.append(cell.contents.strip_code().strip(' '))
        # remove the consumed header cell from the node tree
        raw_table.contents.remove(cell)
    log.debug('parsed %d columns from table %s' % (len(header_cells), self.name))
    for node in raw_table.contents.ifilter_tags(matches=ftag('tr')):
        parsed_row = Row(self.head, node)
        if not parsed_row.is_null:
            self.rows.append(parsed_row)
    log.debug('parsed %d rows from table %s' % (len(self.rows), self.name))
def __init__(self, name, raw_table):
    """Wrap *raw_table* (a mwparserfromhell tag node) and parse it.

    The table name is normalized to unicode; rows and header are
    filled in by ``_read``.
    """
    self.name = ustr(name)
    self._node = raw_table
    self._tr_nodes = raw_table.contents.filter_tags(matches=ftag('tr'))
    # parsed output, filled by _read below
    self._head = []
    self.rows = []
    self._read(raw_table)
def _parse(self, node):
    """Parse one <tr> node into a named Row, honoring rowspan carry-over.

    Columns still covered by an earlier cell's ``rowspan`` reuse the
    cached field; when a row has fewer cells than the header, the
    missing columns are logged and skipped; surplus fields with no
    matching column are dropped with a warning.
    """
    rname = '%s[%s]' % (self._tname, self._idx)
    self._idx += 1
    r = Row(rname, node)
    cols = node.contents.ifilter_tags(matches=ftag('th', 'td'))
    fields = [f for col in cols for f in self._freader.parse(col)]
    for col_name in self.head:
        if self._nspan[col_name]:
            # column is still spanned by a cell from a previous row
            r[col_name] = self._span[col_name]
            self._nspan[col_name] -= 1
            continue
        if not fields:
            # row ran out of cells before the header ran out of columns
            # fix: log.warn is a deprecated alias; use warning()
            log.warning('%s: missing field for column [%s]' % (r.name, col_name))
            continue
        f = fields.pop(0)
        if 'rowspan' in f.attrs:
            # cache this field so the next rowspan-1 rows reuse it
            self._span[col_name] = f
            self._nspan[col_name] = int(f.attrs['rowspan']) - 1
        r[col_name] = f
    for f in fields:
        log.warning('%s: dropping field from unknown column: %s' % (r.name, f))
    return r
def _parse(self, node):
    """Build a Row from a <tr> node, tracking multi-row (rowspan) cells.

    Fields are matched to header columns in order. A column whose
    previous cell declared ``rowspan`` is filled from the cached field
    until the span is exhausted. Extra fields without a column are
    discarded with a warning.
    """
    rname = '%s[%s]' % (self._tname, self._idx)
    self._idx += 1
    r = Row(rname, node)
    cols = node.contents.ifilter_tags(matches=ftag('th', 'td'))
    fields = [f for col in cols for f in self._freader.parse(col)]
    for col_name in self.head:
        if self._nspan[col_name]:
            # still inside an earlier cell's rowspan
            r[col_name] = self._span[col_name]
            self._nspan[col_name] -= 1
            continue
        if not fields:
            # log.warn is deprecated in the logging module; warning() is
            # the supported spelling
            log.warning('%s: missing field for column [%s]' % (r.name, col_name))
            continue
        f = fields.pop(0)
        if 'rowspan' in f.attrs:
            # remember the field for the remaining spanned rows
            self._span[col_name] = f
            self._nspan[col_name] = int(f.attrs['rowspan']) - 1
        r[col_name] = f
    for f in fields:
        log.warning('%s: dropping field from unknown column: %s' % (r.name, f))
    return r
def __init__(self, name, raw_table):
    """Store the raw table node and parse its header, then its rows."""
    self.name = ustr(name)
    self._node = raw_table
    self._tr_nodes = raw_table.contents.filter_tags(matches=ftag('tr'))
    # parsed state, populated by the two _read_* calls
    self._head = []
    self.rows = []
    self._read_header()
    self._read_rows()
def _read(head, node):
    """Zip header names with cell data, or return False on a mismatch.

    Each matched cell is rendered as ``{"value": ..., "link": ...}``.
    Returns False when the row's cell count does not equal the header
    length (the original cell-count check).
    """
    cols = list(node.contents.ifilter_tags(matches=ftag('th', 'td')))
    # fix: build each Field once instead of constructing it up to three
    # times per cell (once just to count via str(), twice for output);
    # len of the rendered list always equalled len(cols) anyway
    fields = [Field(c) for c in cols]
    if len(head) != len(fields):
        return False
    return zip(head, [{"value": str(f), "link": f.link} for f in fields])
def import_tables(article, lang="en"):
    """Fetch *article* and extract its tables grouped by section.

    Returns a nested dict with the page title, extract, and per-section
    entries containing section text plus one sub-dict per extracted
    table (rows, head, rows_count, cols_count). Returns None when no
    table could be extracted.

    NOTE(review): nesting reconstructed from a flattened source —
    confirm against the original file's indentation.
    """
    client = Client(lang)
    page = client.fetch_page(article)
    body = page['revisions'][0]['*']
    extract = client.fetch_extract(article)
    parsed_body = mwp.parse(body, skip_style_tags=True)
    table_extracted = False
    tables_info = nested_dict()
    tables_info['title'] = page['title']
    tables_info['extract'] = extract
    ## get sections
    sections = parsed_body.get_sections(include_lead=False,
                                        include_headings=True, flat=True)
    print(page['title'])
    section_count = 0
    for idx, s in enumerate(sections):
        section_table = False
        t = s.filter_tags(matches=ftag('table'))
        if t:
            head = mwp.parse(s.filter_headings()[0])
            tables_info[str(section_count)]["head"] = head.strip_code()
            s.remove(head)
            table_count = 0
            for i, x in enumerate(t):
                name = '{}|Table {}'.format(page['title'], table_count)
                wt = WikiTable(name, x)
                if not wt.flag:
                    table_extracted = True
                    section_table = True
                    # nested_dict autovivifies, so aliasing the entry is safe
                    entry = tables_info[str(section_count)]["table"][str(
                        table_count)]
                    entry["rows"] = [dict(r) for r in wt.rows]
                    entry["head"] = wt.head
                    entry["rows_count"] = wt.rows_len
                    entry["cols_count"] = wt.head_len
                    table_count += 1
                # hack, only remove if table exists
                try:
                    s.remove(x)
                except ValueError:
                    # fix: was a bare except that hid every error;
                    # mwparserfromhell raises ValueError when the node is
                    # absent — anything else should propagate
                    pass
            tables_info[str(section_count)]["text"] = s.strip_code()
            if section_table:
                section_count += 1
            else:
                # section had tables but none were extracted: discard it
                del tables_info[str(section_count)]
    if table_extracted:
        return tables_info
    return None
def _find_header_row(self):
    """
    Evaluate all rows and determine header position, based on
    greatest number of 'th' tagged elements. Returns None when the
    table contains no <th> cells at all.
    """
    th_max = 0
    header_idx = 0
    for idx, tr in enumerate(self._tr_nodes):
        th_count = len(tr.contents.filter_tags(matches=ftag('th')))
        if th_count > th_max:
            th_max = th_count
            header_idx = idx
    if not th_max:
        # fix: without this guard (present in the sibling variant of
        # this method) a table with zero <th> cells would pop row 0,
        # silently consuming a data row as a bogus header
        return
    self._log('found header at row %d (%d <th> elements)' % (header_idx, th_max))
    header_row = self._tr_nodes.pop(header_idx)
    return header_row.contents.filter_tags(matches=ftag('th'))
def __init__(self, name, raw_table):
    """Wrap *raw_table* and parse it, tracking extraction success.

    ``self.flag`` starts False and signals (per the original comment, a
    hack) whether the table should be returned by callers.
    """
    self.name = ustr(name)
    self._node = raw_table
    self._tr_nodes = raw_table.contents.filter_tags(matches=ftag('tr'))
    self._head = []
    self.rows = []
    # hack to determine whether or not to return table
    self.flag = False
    self._read(raw_table)
def _find_header_flat(self):
    """
    Find header elements in a table, if possible. This case handles
    situations where '<th>' elements are not within a row ('<tr>').
    """
    th_nodes = self._node.contents.filter_tags(matches=ftag('th'),
                                               recursive=False)
    if th_nodes:
        self._log('found header outside rows (%d <th> elements)' % len(th_nodes))
        return th_nodes
    # no flat header cells found
    return None
def _find_header_row(self):
    """
    Evaluate all rows and determine header position, based on
    greatest number of 'th' tagged elements
    """
    # count <th> cells per row up front, then pick the first maximum
    th_counts = [len(tr.contents.filter_tags(matches=ftag('th')))
                 for tr in self._tr_nodes]
    if not th_counts or not max(th_counts):
        # no header cells anywhere
        return None
    th_max = max(th_counts)
    header_idx = th_counts.index(th_max)
    self._log('found header at row %d (%d <th> elements)' % (header_idx, th_max))
    header_row = self._tr_nodes.pop(header_idx)
    return header_row.contents.filter_tags(matches=ftag('th'))
def _make_default_header(self):
    """
    Return a generic placeholder header based on the tables column count
    """
    td_max = 0
    # fix: dropped enumerate — its index was never used
    for tr in self._tr_nodes:
        td_count = len(tr.contents.filter_tags(matches=ftag('td')))
        if td_count > td_max:
            td_max = td_count
    self._log('creating default header (%d columns)' % td_max)
    return ['column%d' % n for n in range(0, td_max)]
def _make_default_header(self):
    """
    Return a generic placeholder header based on the tables column count
    """
    td_max = 0
    # fix: removed unused enumerate index
    for tr in self._tr_nodes:
        td_count = len(tr.contents.filter_tags(matches=ftag('td')))
        if td_count > td_max:
            td_max = td_count
    self._log('creating default header (%d columns)' % td_max)
    return ['column%d' % n for n in range(0, td_max)]
def import_tables(article, lang="en"):
    """Fetch *article* and wrap each of its tables in a WikiTable.

    Tables are named '<page title>[<index>]' in document order.
    """
    client = Client(lang)
    page = client.fetch_page(article)
    body = page['revisions'][0]['*']
    ## parse for tables
    raw_tables = mwp.parse(body).filter_tags(matches=ftag('table'))
    # comprehension replaces the original inner generator + list() dance
    return [WikiTable('%s[%s]' % (page['title'], idx), table)
            for idx, table in enumerate(raw_tables)]
def _read(head, node):
    """Pair each header name with the Field built from its cell."""
    cells = node.contents.ifilter_tags(matches=ftag('th', 'td'))
    fields = [Field(cell) for cell in cells]
    return zip(head, fields)
def _read(head, node):
    """Pair header names with Fields from the row's <td> cells only."""
    data_cells = node.contents.ifilter_tags(matches=ftag('td'))
    fields = [Field(cell) for cell in data_cells]
    return zip(head, fields)
def parse_programs(self):
    """Parse table with descriptions for program, strategies and names.

    Assumes a wikipage with a table formatted in a particular way,
    with cells spanning multiple rows and HTML comments containing
    some of the information. An instance of such a table can be
    found on:
    https://se.wikimedia.org/w/index.php?title=Verksamhetsplan_2019/Tabell_%C3%B6ver_program,_strategi_och_m%C3%A5l&oldid=75471.

    Side effects: appends to ``self._programs``; logs a warning for
    projects that match no strategy.
    """
    operational_plan_page = Page(
        self._site,
        self._make_year_title(
            self._config["year_pages"]["operational_plan"]))
    # Get table string. This assumes that it is the first table on
    # the page.
    table_string = str(
        mwp.parse(operational_plan_page.text).filter_tags(
            matches=ftag('table'))[0])
    # Remove ref tags and links.
    table_string = re.sub(r"(<ref.*?>.*?</ref>|\[\[.*?\||\]\])", "",
                          table_string, flags=re.S)
    # Projects not yet attached to a strategy; drained as rows match.
    remaining_projects = list(self._projects.keys())
    # Split table on rows.
    rows = table_string.split("|-")
    for row in rows[1:]:
        # Skip first rows; we don't need the headers.
        if not row.rstrip("|}").strip():
            # This is just the end table row, skip it.
            continue
        # Split rows on pipes and remove formatting.
        cells = list(
            filter(
                None,
                map(lambda c: c.split("|")[-1].strip(),
                    re.split(r"[\|\n]\|", row))))
        if len(cells) == 3:
            # Row includes program.
            # The program cell looks like "Name <!-- number -->".
            program_name, program_number = \
                re.match(r"(.*)\s+<!--\s*(.*)\s*-->", cells[0]).groups()
            self._programs.append({
                "number": program_number,
                "name": program_name,
                "strategies": []
            })
        if len(cells) >= 2:
            # Row includes strategy, which is always in the cell
            # second from the right. The HTML comment carries the
            # strategy number and a short description.
            strategy, strategy_number, strategy_short = \
                re.match(
                    r"(.*)\s*<!--\s*(\d+)\s*(.*)\s-->",
                    cells[-2]
                ).groups()
            self._programs[-1]["strategies"].append({
                "number": strategy_number,
                "description": strategy,
                "short_description": strategy_short,
                "projects": [],
                "goals": []
            })
            for project in self._get_projects_for_strategy(
                    strategy_number):
                # Add projects for this strategy.
                self._programs[-1]["strategies"][-1]["projects"].append(
                    project)
                remaining_projects.remove(project)
        # The rightmost cell always contains a goal.
        # NOTE(review): nesting reconstructed from a flattened source —
        # this appears to run for every data row, appending the goal to
        # the most recently seen strategy; confirm against the original
        # file's indentation.
        goal = cells[-1]
        self._programs[-1]["strategies"][-1]["goals"].append(goal)
    if remaining_projects:
        logging.warning(
            "There were projects which could not be matched to programs, "
            "these will be skipped from overview pages: '{}'".format(
                ', '.join(remaining_projects)))