def refactor(self, soup, md):
    """Clean up the parsed HTML document in place and collect metadata.

    The steps run in this order, and later steps rely on earlier ones:

    1. Normalise ``style`` attributes and process ``lang`` attributes on
       everything below ``<body>``.
    2. Drop ``p``/``h1``/``h2`` elements without text.
    3. Wrap ``p.Zitat`` paragraphs in ``<blockquote>`` and turn paragraphs
       whose text matches ``self.heading_pattern`` into headings.
    4. Push ``h1``/``h2`` headings down one level per pass.
    5. Unwrap ``<a name="OLE_LINK...">`` anchors.
    6. If ``self.id`` contains a dot, read title and authors from the first
       elements of the first ``<div>`` and remove them.
    7. Move the "References" section into ``md``.
    8. Wrap every table in ``<div class="table">``.

    :param soup: parsed document; used through ``find``, ``find_all`` and \
    ``new_tag`` (presumably a BeautifulSoup object -- confirm with caller).
    :param md: mapping updated in place; may receive the keys ``'title'``, \
    ``'authors'``, ``'refs'`` and ``'refs_comments'``.
    :return: the ``soup`` object that was passed in.
    :raises AssertionError: if ``self.id`` contains a dot and the first four \
    children of the first ``<div>`` are not ``p, p, table, h3``.
    :raises ValueError: if a paragraph in the references section is not a \
    known category and has fewer than three whitespace-separated tokens.
    """
    def update_style(current):
        """Return the ``style`` value with unwanted CSS rules filtered out."""
        style = []
        for rule in (current or '').split(';'):
            rule = rule.strip()
            # These two exact rules (tab-stops:14.2pt, text-indent:36.0pt)
            # are replaced by a top margin.
            if rule in ['tab-stops:14.2pt', 'text-indent:36.0pt']:
                rule = 'margin-top:0.4em'
            # Junicode font declarations are dropped entirely.
            if normalize_whitespace(rule, repl='') in [
                'font-family:Junicode',
                'font-family:JunicodeRegular',
            ]:
                continue
            # Skip empty fragments and "mso-" prefixed rules.
            if rule and not rule.startswith('mso-'):
                style.append(rule)
        return ';'.join(style)

    # clean attributes:
    for e in descendants(soup.find('body')):
        update_attr(e, 'style', update_style)
        # presumably passing None removes the attribute -- see update_attr
        update_attr(e, 'lang', None)

    # remove paragraphs and headings that carry no text:
    for e, t in tag_and_text(
            descendants(soup.find('body'), include=['p', 'h1', 'h2']),
            non_empty=False):
        if not t:
            e.extract()

    for p in soup.find_all('p'):
        if p.attrs.get('class') == ['Zitat']:
            p.wrap(soup.new_tag('blockquote'))
            continue

        if p.parent.name != 'td':
            # need to detect headings by text, too!
            match = self.heading_pattern.match(text(p).lower())
            if match:
                p.name = 'h2' if match.group('sub') else 'h1'

    # re-classify section headings:
    # NOTE(review): the h1 pass runs before the h2 pass, so elements renamed
    # to h2 in the first pass look like they get renamed again to h3. The
    # code below searches for h3 headings, so the order is kept as it was --
    # confirm that this is the intended result.
    for i in range(1, 3):
        for p in soup.find_all('h%s' % i):
            p.name = 'h%s' % (i + 1,)

    for p in soup.find_all('a'):
        if p.attrs.get('name', '').startswith('OLE_LINK'):
            p.unwrap()

    top_level_elements = children(soup.find('div'))[:4]
    if '.' in self.id:
        names = [e.name for e in top_level_elements]
        if names != ['p', 'p', 'table', 'h3']:
            # Print the same elements as before for debugging, but only
            # those that exist: indexing blindly would raise IndexError
            # and hide the actual layout problem.
            for index in (0, 1, 3):
                if index < len(top_level_elements):
                    print(top_level_elements[index])
            # Raised explicitly so the check also runs under ``python -O``.
            raise AssertionError(
                'unexpected top-level elements: %r' % (names,))

        md['title'] = text(top_level_elements[0])
        # re.split already returns a new list; the parts are not stripped.
        md['authors'] = re.split(',|&| and ', text(top_level_elements[1]))
        remove(*top_level_elements[:3])

    refs = soup.find(
        lambda e: e.name == 'h3' and text(e).startswith('References'))
    if refs:
        ex = []  # elements to take out of the document afterwards
        category = None
        for e, t in tag_and_text(next_siblings(refs)):
            if e.name == 'p':
                if t in REFERENCE_CATEGORIES:
                    category = t
                elif len(t.split()) < 3:
                    raise ValueError(t)
                elif 'comment' in e.attrs.get('class', []):
                    if 'refs_comments' not in md:
                        md['refs_comments'] = [t]
                    else:
                        md['refs_comments'].append(t)
                else:
                    if not YEAR.search(t):
                        # Report reference text without a YEAR match; it is
                        # still passed to get_ref below.
                        print(t)
                    # Same lazy initialisation as for 'refs_comments', so a
                    # fresh mapping does not fail with KeyError here.
                    if 'refs' not in md:
                        md['refs'] = []
                    md['refs'].append(self.get_ref(e, category=category))
                ex.append(e)
            elif e.name in ['h3', 'h4']:
                # Sub-headings inside the section name the current category.
                category = t
                ex.append(e)
        for e in ex + [refs]:
            e.extract()

    for t in soup.find_all('table'):
        # 'class' is a Python keyword, hence the dict unpacking.
        t.wrap(soup.new_tag('div', **{'class': 'table'}))

    return soup
def test_remove(self):
    """Smoke test: ``remove`` accepts two elements, one looked up below the other.

    There are no assertions; the test passes when the call does not raise.
    """
    from souplib import remove

    outer = self.soup.a.b
    inner = self.soup.a.b.c
    remove(outer, inner)
def test_remove2(self):
    """Smoke test: ``remove`` accepts everything in ``self.soup.a.descendants``.

    There are no assertions; the test passes when the call does not raise.
    """
    from souplib import remove

    # Collect the descendants first, exactly as argument unpacking would.
    nodes = tuple(self.soup.a.descendants)
    remove(*nodes)
def refactor(self, soup, md):
    """Clean up the parsed HTML document in place and collect metadata.

    The steps run in this order, and later steps rely on earlier ones:

    1. Normalise ``style`` attributes and process ``lang`` attributes on
       everything below ``<body>``.
    2. Drop ``p``/``h1``/``h2`` elements without text.
    3. Wrap ``p.Zitat`` paragraphs in ``<blockquote>`` and turn paragraphs
       whose text matches ``self.heading_pattern`` into headings.
    4. Push ``h1``/``h2`` headings down one level per pass.
    5. Unwrap ``<a name="OLE_LINK...">`` anchors.
    6. If ``self.id`` contains a dot, read title and authors from the first
       elements of the first ``<div>`` and remove them.
    7. Move the "References" section into ``md``.
    8. Wrap every table in ``<div class="table">``.

    :param soup: parsed document; used through ``find``, ``find_all`` and \
    ``new_tag`` (presumably a BeautifulSoup object -- confirm with caller).
    :param md: mapping updated in place; may receive the keys ``'title'``, \
    ``'authors'``, ``'refs'`` and ``'refs_comments'``.
    :return: the ``soup`` object that was passed in.
    :raises AssertionError: if ``self.id`` contains a dot and the first four \
    children of the first ``<div>`` are not ``p, p, table, h3``.
    :raises ValueError: if a paragraph in the references section is not a \
    known category and has fewer than three whitespace-separated tokens.
    """
    def update_style(current):
        """Return the ``style`` value with unwanted CSS rules filtered out."""
        style = []
        for rule in (current or '').split(';'):
            rule = rule.strip()
            # These two exact rules (tab-stops:14.2pt, text-indent:36.0pt)
            # are replaced by a top margin.
            if rule in ['tab-stops:14.2pt', 'text-indent:36.0pt']:
                rule = 'margin-top:0.4em'
            # Junicode font declarations are dropped entirely.
            if normalize_whitespace(rule, repl='') in [
                'font-family:Junicode',
                'font-family:JunicodeRegular',
            ]:
                continue
            # Skip empty fragments and "mso-" prefixed rules.
            if rule and not rule.startswith('mso-'):
                style.append(rule)
        return ';'.join(style)

    # clean attributes:
    for e in descendants(soup.find('body')):
        update_attr(e, 'style', update_style)
        # presumably passing None removes the attribute -- see update_attr
        update_attr(e, 'lang', None)

    # remove paragraphs and headings that carry no text:
    for e, t in tag_and_text(
            descendants(soup.find('body'), include=['p', 'h1', 'h2']),
            non_empty=False):
        if not t:
            e.extract()

    for p in soup.find_all('p'):
        if p.attrs.get('class') == ['Zitat']:
            p.wrap(soup.new_tag('blockquote'))
            continue

        if p.parent.name != 'td':
            # need to detect headings by text, too!
            match = self.heading_pattern.match(text(p).lower())
            if match:
                p.name = 'h2' if match.group('sub') else 'h1'

    # re-classify section headings:
    # NOTE(review): the h1 pass runs before the h2 pass, so elements renamed
    # to h2 in the first pass look like they get renamed again to h3. The
    # code below searches for h3 headings, so the order is kept as it was --
    # confirm that this is the intended result.
    for i in range(1, 3):
        for p in soup.find_all('h%s' % i):
            p.name = 'h%s' % (i + 1,)

    for p in soup.find_all('a'):
        if p.attrs.get('name', '').startswith('OLE_LINK'):
            p.unwrap()

    top_level_elements = children(soup.find('div'))[:4]
    if '.' in self.id:
        names = [e.name for e in top_level_elements]
        if names != ['p', 'p', 'table', 'h3']:
            # Print the same elements as before for debugging, but only
            # those that exist: indexing blindly would raise IndexError
            # and hide the actual layout problem.
            for index in (0, 1, 3):
                if index < len(top_level_elements):
                    print(top_level_elements[index])
            # Raised explicitly so the check also runs under ``python -O``.
            raise AssertionError(
                'unexpected top-level elements: %r' % (names,))

        md['title'] = text(top_level_elements[0])
        # re.split already returns a new list; the parts are not stripped.
        md['authors'] = re.split(',|&| and ', text(top_level_elements[1]))
        remove(*top_level_elements[:3])

    refs = soup.find(
        lambda e: e.name == 'h3' and text(e).startswith('References'))
    if refs:
        ex = []  # elements to take out of the document afterwards
        category = None
        for e, t in tag_and_text(next_siblings(refs)):
            if e.name == 'p':
                if t in REFERENCE_CATEGORIES:
                    category = t
                elif len(t.split()) < 3:
                    raise ValueError(t)
                elif 'comment' in e.attrs.get('class', []):
                    if 'refs_comments' not in md:
                        md['refs_comments'] = [t]
                    else:
                        md['refs_comments'].append(t)
                else:
                    if not YEAR.search(t):
                        # Report reference text without a YEAR match; it is
                        # still passed to get_ref below.
                        print(t)
                    # Same lazy initialisation as for 'refs_comments', so a
                    # fresh mapping does not fail with KeyError here.
                    if 'refs' not in md:
                        md['refs'] = []
                    md['refs'].append(self.get_ref(e, category=category))
                ex.append(e)
            elif e.name in ['h3', 'h4']:
                # Sub-headings inside the section name the current category.
                category = t
                ex.append(e)
        for e in ex + [refs]:
            e.extract()

    for t in soup.find_all('table'):
        # 'class' is a Python keyword, hence the dict unpacking.
        t.wrap(soup.new_tag('div', **{'class': 'table'}))

    return soup