def _paragraphs(self, soup):
    """Yield ``Paragraph`` objects built from the block elements of *soup*.

    The method works in two passes and modifies *soup* in place:

    1. The first ``table`` whose text matches one of the value-table
       patterns, or the first run of numbered ``p`` rows ("1. ... <n>"
       followed by at least one more such row), is replaced by the element
       returned from ``new_tag(soup, 'p', 'value-table')``. The search stops
       after the first replacement.
    2. All ``p``/``table``/``ol``/``ul`` elements that are not direct
       children of ``li``/``td`` are collected as
       ``(element, text, element.name)`` triples. An element whose text
       equals ``self.BR`` closes the current group, which is yielded as
       ``Paragraph(lines)``. Once a "References"/"Reference" line has been
       seen, separators no longer split; the remaining group is yielded at
       the end as ``Paragraph(lines, refs=refs)``.

    :param soup: parsed document to walk.
    :return: generator of ``Paragraph`` instances.
    """
    lines = []
    refs = False

    # Pass 1: swap the first value table for a placeholder paragraph.
    for e in soup.find_all(['p', 'table']):
        t = text(e)
        if e.name == 'table':
            if re.match(r'[\-\s]+excl\s+', t) \
                    or re.match(r'[\-\s]*1\.[^0-9]+[0-9]+\s+2\.\s+', t):
                e.replace_with(new_tag(soup, 'p', 'value-table'))
                break
        if e.name == 'p':
            if re.match(r'1\.\s+(.+?)\s+[0-9]+$', t):
                # Gather the directly following paragraphs that look like
                # further numbered rows of the same table.
                ex = []
                for p in next_siblings(e):
                    tt = text(p)
                    if p.name != 'p' \
                            or not re.match(r'[0-9]\.\s+(.+?)\s+[0-9]+$', tt):
                        break
                    ex.append(p)
                if ex:
                    # Only treat it as a table if at least one more row
                    # follows; drop those rows and replace the first one.
                    for ee in ex:
                        ee.extract()
                    e.replace_with(new_tag(soup, 'p', 'value-table'))
                    break

    # Pass 2: group the remaining block elements into paragraphs.
    for e, t in tag_and_text(soup.find_all(['p', 'table', 'ol', 'ul'])):
        if e.parent.name in ['li', 'td']:
            continue
        # Evaluated before ``t`` may be blanked below.
        br = t == self.BR
        if t in ['References', 'Reference']:
            refs = True
            t = ''
        elif not lines and re.match(
                r'[0-9]+\.\s+[A-Za-z]+(\s+[A-Za-z]+)*$', t):
            # "1. Some title" at the start of a group -> section heading.
            e.name = 'h3'
        elif not lines and re.match(r'[0-9]+\.[0-9]+\.\s+[A-Z]', t):
            # "1.1. Title" at the start of a group -> subsection heading.
            e.name = 'h4'
        elif t.endswith('and the APiCS Consortium'):
            continue

        if br and not refs:
            if lines:
                yield Paragraph(lines)
            lines = []
        if t and t != self.BR:
            lines.append((e, t, e.name))

    if lines:
        yield Paragraph(lines, refs=refs)
def _paragraphs(self, soup):
    """Yield ``Paragraph`` objects built from the block elements of *soup*.

    The method works in two passes and modifies *soup* in place:

    1. The first ``table`` whose text matches one of the value-table
       patterns, or the first run of numbered ``p`` rows ("1. ... <n>"
       followed by at least one more such row), is replaced by the element
       returned from ``new_tag(soup, 'p', 'value-table')``. The search stops
       after the first replacement.
    2. All ``p``/``table``/``ol``/``ul`` elements that are not direct
       children of ``li``/``td`` are collected as
       ``(element, text, element.name)`` triples. An element whose text
       equals ``self.BR`` closes the current group, which is yielded as
       ``Paragraph(lines)``. Once a "References"/"Reference" line has been
       seen, separators no longer split; the remaining group is yielded at
       the end as ``Paragraph(lines, refs=refs)``.

    :param soup: parsed document to walk.
    :return: generator of ``Paragraph`` instances.
    """
    lines = []
    refs = False

    # Pass 1: swap the first value table for a placeholder paragraph.
    for e in soup.find_all(['p', 'table']):
        t = text(e)
        if e.name == 'table':
            if re.match(r'[\-\s]+excl\s+', t) \
                    or re.match(r'[\-\s]*1\.[^0-9]+[0-9]+\s+2\.\s+', t):
                e.replace_with(new_tag(soup, 'p', 'value-table'))
                break
        if e.name == 'p':
            if re.match(r'1\.\s+(.+?)\s+[0-9]+$', t):
                # Gather the directly following paragraphs that look like
                # further numbered rows of the same table.
                ex = []
                for p in next_siblings(e):
                    tt = text(p)
                    if p.name != 'p' \
                            or not re.match(r'[0-9]\.\s+(.+?)\s+[0-9]+$', tt):
                        break
                    ex.append(p)
                if ex:
                    # Only treat it as a table if at least one more row
                    # follows; drop those rows and replace the first one.
                    for ee in ex:
                        ee.extract()
                    e.replace_with(new_tag(soup, 'p', 'value-table'))
                    break

    # Pass 2: group the remaining block elements into paragraphs.
    for e, t in tag_and_text(soup.find_all(['p', 'table', 'ol', 'ul'])):
        if e.parent.name in ['li', 'td']:
            continue
        # Evaluated before ``t`` may be blanked below.
        br = t == self.BR
        if t in ['References', 'Reference']:
            refs = True
            t = ''
        elif not lines and re.match(
                r'[0-9]+\.\s+[A-Za-z]+(\s+[A-Za-z]+)*$', t):
            # "1. Some title" at the start of a group -> section heading.
            e.name = 'h3'
        elif not lines and re.match(r'[0-9]+\.[0-9]+\.\s+[A-Z]', t):
            # "1.1. Title" at the start of a group -> subsection heading.
            e.name = 'h4'
        elif t.endswith('and the APiCS Consortium'):
            continue

        if br and not refs:
            if lines:
                yield Paragraph(lines)
            lines = []
        if t and t != self.BR:
            lines.append((e, t, e.name))

    if lines:
        yield Paragraph(lines, refs=refs)
def refactor(self, soup, md):
    """Clean up the parsed document *soup* in place and fill metadata *md*.

    Steps, in order:

    - rewrite ``style`` attributes and pass ``None`` for ``lang`` on every
      descendant of ``body``;
    - extract ``p``/``h1``/``h2`` elements without text;
    - wrap ``p`` elements of class ``Zitat`` in ``blockquote`` and rename
      paragraphs matching ``self.heading_pattern`` to ``h1``/``h2``;
    - shift heading levels;
    - unwrap ``a`` elements whose ``name`` starts with ``OLE_LINK``;
    - if ``self.id`` contains a ``.``, read title and authors from the
      first two top-level paragraphs;
    - collect the entries following the "References" heading;
    - wrap every ``table`` in ``<div class="table">``.

    :param soup: parsed document; modified in place.
    :param md: metadata mapping. ``title``, ``authors`` and \
    ``refs_comments`` may be set; ``refs`` is appended to and is expected \
    to exist already.
    :return: the modified *soup*.
    :raises AssertionError: if ``self.id`` contains a ``.`` and the first \
    four top-level elements are not ``p, p, table, h3``.
    :raises ValueError: if a paragraph in the reference section is not a \
    known category and has fewer than three words.
    """
    def update_style(current):
        # Rebuild a ``style`` value rule by rule.
        style = []
        for rule in (current or '').split(';'):
            rule = rule.strip()
            # These two indentation rules are replaced by a top margin.
            if rule in ['tab-stops:14.2pt', 'text-indent:36.0pt']:
                rule = 'margin-top:0.4em'
            if normalize_whitespace(rule, repl='') in [
                'font-family:Junicode',
                'font-family:JunicodeRegular',
            ]:
                continue
            # Drop empty rules and "mso-" prefixed ones.
            if rule and not rule.startswith('mso-'):
                style.append(rule)
        return ';'.join(style)

    body = soup.find('body')

    # clean attributes:
    for e in descendants(body):
        update_attr(e, 'style', update_style)
        update_attr(e, 'lang', None)

    for e, t in tag_and_text(
            descendants(body, include=['p', 'h1', 'h2']),
            non_empty=False):
        if not t:
            e.extract()

    for p in soup.find_all('p'):
        if p.attrs.get('class') == ['Zitat']:
            p.wrap(soup.new_tag('blockquote'))
            continue

        if p.parent.name != 'td':
            # need to detect headings by text, too!
            match = self.heading_pattern.match(text(p).lower())
            if match:
                p.name = 'h2' if match.group('sub') else 'h1'

    # re-classify section headings:
    # NOTE: levels are processed in ascending order, so elements renamed
    # from h1 to h2 are found again in the second pass and end up as h3 as
    # well. The code below looks for 'h3', so this order is kept as is.
    for i in range(1, 3):
        for p in soup.find_all('h%s' % i):
            p.name = 'h%s' % (i + 1,)

    for a in soup.find_all('a'):
        if a.attrs.get('name', '').startswith('OLE_LINK'):
            a.unwrap()

    top_level_elements = children(soup.find('div'))[:4]
    if '.' in self.id:
        # NOTE(review): the original indentation was lost; title/author
        # extraction is assumed to belong to this branch -- confirm.
        expected = ['p', 'p', 'table', 'h3']
        found = [e.name for e in top_level_elements]
        if found != expected:
            # Explicit check instead of ``assert`` so it is not stripped
            # under -O. Show the offending elements (the table at index 2
            # is skipped, as before) without assuming four are present.
            for index, element in enumerate(top_level_elements):
                if index != 2:
                    print(element)
            raise AssertionError(
                'unexpected top-level elements: %r != %r' % (found, expected))

        md['title'] = text(top_level_elements[0])
        # NOTE(review): the parts are not stripped -- confirm that callers
        # handle surrounding whitespace.
        md['authors'] = re.split(',|&| and ', text(top_level_elements[1]))
        remove(*top_level_elements[:3])

    refs = soup.find(
        lambda e: e.name == 'h3' and text(e).startswith('References'))
    if refs:
        ex = []
        category = None
        for e, t in tag_and_text(next_siblings(refs)):
            if e.name == 'p':
                if t in REFERENCE_CATEGORIES:
                    category = t
                elif len(t.split()) < 3:
                    raise ValueError(t)
                elif 'comment' in e.attrs.get('class', []):
                    md.setdefault('refs_comments', []).append(t)
                else:
                    if not YEAR.search(t):
                        # Report references without a recognizable year.
                        print(t)
                    md['refs'].append(self.get_ref(e, category=category))
                ex.append(e)
            elif e.name in ['h3', 'h4']:
                category = t
                ex.append(e)
        # Remove the processed elements and the heading only after the
        # sibling traversal is complete.
        for e in ex + [refs]:
            e.extract()

    for table in soup.find_all('table'):
        table.wrap(soup.new_tag('div', **{'class': 'table'}))

    return soup
def test_next_siblings(self):
    # ``next_siblings`` is expected to return an empty list for
    # ``self.soup.a``.
    from souplib import next_siblings

    # ``assertEquals`` is a deprecated alias that was removed in
    # Python 3.12; ``assertEqual`` is available on Python 2.7 and 3.
    self.assertEqual(next_siblings(self.soup.a), [])
def refactor(self, soup, md):
    """Clean up the parsed document *soup* in place and fill metadata *md*.

    Steps, in order:

    - rewrite ``style`` attributes and pass ``None`` for ``lang`` on every
      descendant of ``body``;
    - extract ``p``/``h1``/``h2`` elements without text;
    - wrap ``p`` elements of class ``Zitat`` in ``blockquote`` and rename
      paragraphs matching ``self.heading_pattern`` to ``h1``/``h2``;
    - shift heading levels;
    - unwrap ``a`` elements whose ``name`` starts with ``OLE_LINK``;
    - if ``self.id`` contains a ``.``, read title and authors from the
      first two top-level paragraphs;
    - collect the entries following the "References" heading;
    - wrap every ``table`` in ``<div class="table">``.

    :param soup: parsed document; modified in place.
    :param md: metadata mapping. ``title``, ``authors`` and \
    ``refs_comments`` may be set; ``refs`` is appended to and is expected \
    to exist already.
    :return: the modified *soup*.
    :raises AssertionError: if ``self.id`` contains a ``.`` and the first \
    four top-level elements are not ``p, p, table, h3``.
    :raises ValueError: if a paragraph in the reference section is not a \
    known category and has fewer than three words.
    """
    def update_style(current):
        # Rebuild a ``style`` value rule by rule.
        style = []
        for rule in (current or '').split(';'):
            rule = rule.strip()
            # These two indentation rules are replaced by a top margin.
            if rule in ['tab-stops:14.2pt', 'text-indent:36.0pt']:
                rule = 'margin-top:0.4em'
            if normalize_whitespace(rule, repl='') in [
                'font-family:Junicode',
                'font-family:JunicodeRegular',
            ]:
                continue
            # Drop empty rules and "mso-" prefixed ones.
            if rule and not rule.startswith('mso-'):
                style.append(rule)
        return ';'.join(style)

    body = soup.find('body')

    # clean attributes:
    for e in descendants(body):
        update_attr(e, 'style', update_style)
        update_attr(e, 'lang', None)

    for e, t in tag_and_text(
            descendants(body, include=['p', 'h1', 'h2']),
            non_empty=False):
        if not t:
            e.extract()

    for p in soup.find_all('p'):
        if p.attrs.get('class') == ['Zitat']:
            p.wrap(soup.new_tag('blockquote'))
            continue

        if p.parent.name != 'td':
            # need to detect headings by text, too!
            match = self.heading_pattern.match(text(p).lower())
            if match:
                p.name = 'h2' if match.group('sub') else 'h1'

    # re-classify section headings:
    # NOTE: levels are processed in ascending order, so elements renamed
    # from h1 to h2 are found again in the second pass and end up as h3 as
    # well. The code below looks for 'h3', so this order is kept as is.
    for i in range(1, 3):
        for p in soup.find_all('h%s' % i):
            p.name = 'h%s' % (i + 1,)

    for a in soup.find_all('a'):
        if a.attrs.get('name', '').startswith('OLE_LINK'):
            a.unwrap()

    top_level_elements = children(soup.find('div'))[:4]
    if '.' in self.id:
        # NOTE(review): the original indentation was lost; title/author
        # extraction is assumed to belong to this branch -- confirm.
        expected = ['p', 'p', 'table', 'h3']
        found = [e.name for e in top_level_elements]
        if found != expected:
            # Explicit check instead of ``assert`` so it is not stripped
            # under -O. Show the offending elements (the table at index 2
            # is skipped, as before) without assuming four are present.
            for index, element in enumerate(top_level_elements):
                if index != 2:
                    print(element)
            raise AssertionError(
                'unexpected top-level elements: %r != %r' % (found, expected))

        md['title'] = text(top_level_elements[0])
        # NOTE(review): the parts are not stripped -- confirm that callers
        # handle surrounding whitespace.
        md['authors'] = re.split(',|&| and ', text(top_level_elements[1]))
        remove(*top_level_elements[:3])

    refs = soup.find(
        lambda e: e.name == 'h3' and text(e).startswith('References'))
    if refs:
        ex = []
        category = None
        for e, t in tag_and_text(next_siblings(refs)):
            if e.name == 'p':
                if t in REFERENCE_CATEGORIES:
                    category = t
                elif len(t.split()) < 3:
                    raise ValueError(t)
                elif 'comment' in e.attrs.get('class', []):
                    md.setdefault('refs_comments', []).append(t)
                else:
                    if not YEAR.search(t):
                        # Report references without a recognizable year.
                        print(t)
                    md['refs'].append(self.get_ref(e, category=category))
                ex.append(e)
            elif e.name in ['h3', 'h4']:
                category = t
                ex.append(e)
        # Remove the processed elements and the heading only after the
        # sibling traversal is complete.
        for e in ex + [refs]:
            e.extract()

    for table in soup.find_all('table'):
        table.wrap(soup.new_tag('div', **{'class': 'table'}))

    return soup