import re

from bs4 import BeautifulSoup
from html2text import HTML2Text
from parsel import Selector  # scrapy.Selector exposes the same API

# html2text.handle() is a method of the HTML2Text class, not a module-level
# function, so the snippets below share one converter instance bound to the
# name html2text.
html2text = HTML2Text()


def parse(body):
    # create_briefing, initiate_piece, clean_tags and remove_link_params are
    # project helpers defined elsewhere in the codebase.
    document = Selector(text=body)
    title = document.css('title::text').extract_first()
    xpath = '//table[.//table[.//a[contains(text(), "%s")]]]' % title
    table = document.xpath(xpath)[-1]

    # Rows containing these markers separate one piece from the next.
    detection_strings = ['_____', 'Photographs may appear out of order']
    sanitize_regex = [
        (r'</strong>\s*<strong>', ''),
        (r'>\d{1,2}\s*. ', '>'),
        (r'<strong>\s*</strong>', ''),
    ]

    briefing = create_briefing(table)
    pieces = []
    piece = initiate_piece()

    for row in table.css('tr'):
        image_tag = row.css('td img::attr(src)').extract_first()
        if image_tag:
            # Swap the article-sized image for the full-resolution variant.
            piece['image'] = image_tag.replace('-articleLarge.jpg', '-superJumbo.jpg')
            caption = row.css('td span::text').extract_first()
            piece['imageCaption'] = caption.encode('utf-8').strip() if caption else ''
            continue

        # A separator row closes the current piece and starts the next one.
        if any(marker in text.strip()
               for text in row.css('em::text, td::text').extract()
               for marker in detection_strings):
            if piece.get('title') and piece.get('image'):
                piece['pieceTextContent'] = piece['pieceTextContent'].strip('\n')
                pieces.append(piece)
                piece = initiate_piece()
            continue

        headline = row.css('strong::text').extract()
        if piece.get('image') or headline:
            inner_html = row.css('td').extract_first()
            inner_html = clean_tags(['td'], inner_html)
            inner_html = re.sub('title="(.*?)"', '', inner_html)

            if not piece.get('number'):
                # The headline row holds "<number>. <title>".
                headline = ' '.join(headline)
                piece['number'], piece['title'] = [x.strip() for x in headline.split('.', 1)]
                for pattern, replacement in sanitize_regex:
                    inner_html = re.sub(pattern, replacement, inner_html)
                piece['number'] = int(piece['number'].replace(' ', ''))
                if piece['title'] == '':
                    piece['title'] = str(piece['number'])

            content = '\n\n' + html2text.handle(inner_html).strip()
            piece['pieceTextContent'] += remove_link_params(content, row)

    return briefing, pieces
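
# Self-contained illustration of the Selector pattern parse() relies on: grab the
# page title, then find the outermost table that wraps a table linking to it.
# The HTML below is made up purely for demonstration.
sample = '''
<html>
  <head><title>Morning Briefing</title></head>
  <body>
    <table><tr><td>
      <table><tr><td><a href="#">Morning Briefing</a></td></tr></table>
    </td></tr></table>
  </body>
</html>
'''
demo = Selector(text=sample)
demo_title = demo.css('title::text').extract_first()   # 'Morning Briefing'
matches = demo.xpath('//table[.//table[.//a[contains(text(), "%s")]]]' % demo_title)
print(demo_title, len(matches))                         # only the outer table matches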
def parse_link(self, url, file, tags_to_search=['p']):
    # Method of a scraper class; self.get_html() is expected to return the raw
    # HTML of the page, or None on failure.
    response = self.get_html(url)
    if response is None:
        return
    soup = BeautifulSoup(response, 'lxml')

    text_bucket = []
    for tag in soup.findAll(tags_to_search):
        text = html2text.handle(tag.text)
        # Drop bare URLs, then replace anything outside a small character whitelist.
        text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
        text = re.sub(r"[^a-zA-Z0-9.!?,;:@()'’]", ' ', text)
        text_bucket.append(text.strip())

    file.write("\n===\n")
    file.write("\n".join(text_bucket))
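
# Illustrative driver for parse_link(); the FakeScraper class, its canned HTML
# and the output filename are assumptions made for this sketch, not part of the
# original code. It relies on the imports and html2text converter set up at the
# top of this listing. Because parse_link() is shown at module level, any object
# exposing get_html() can stand in for the real scraper instance.
class FakeScraper:
    def get_html(self, url):
        # Canned page so the sketch runs without touching the network.
        return ('<html><body>'
                '<p>First paragraph, with a stray link http://example.com</p>'
                '<p>Second paragraph!</p>'
                '</body></html>')


with open('corpus.txt', 'w', encoding='utf-8') as out:
    parse_link(FakeScraper(), 'https://example.com/article', out)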
def markdown(self):
    """Markdown representation of the element."""
    return html2text.handle(self.html)
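
# Illustrative use of markdown(); the Element class below is an assumption made
# for this sketch (only the html attribute is implied by the original code).
class Element:
    def __init__(self, html):
        self.html = html


Element.markdown = markdown  # attach the helper above as a method

# Prints Markdown along the lines of: "Hi **there**, see [the docs](https://example.com)."
print(Element('<p>Hi <b>there</b>, see <a href="https://example.com">the docs</a>.</p>').markdown())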