Beispiel #1
0
def parse(body):
	"""Parse a briefing HTML page into its briefing summary and its pieces.

	The content table is the last ``<table>`` that wraps a nested table holding
	a link whose text contains the page ``<title>``. Its rows are walked in
	document order:

	* a row with an image stores the image URL and caption on the current piece;
	* a row containing one of the separator strings closes the current piece
	  (kept only if it has both a title and an image) and starts a new one;
	* any other row that has a headline, or follows an image, is converted to
	  Markdown and appended to the piece's text. The first such row of a piece
	  also supplies its number and title.

	A piece is only collected when a separator row follows it.

	Args:
		body: HTML source of the page.

	Returns:
		A ``(briefing, pieces)`` tuple: ``briefing`` is the value returned by
		``create_briefing`` for the content table, ``pieces`` the list of
		collected piece dicts.

	Raises:
		IndexError: if no table matching the page title is found.
		ValueError: if a headline has no ``.`` separating number and title, or
			its number part is not an integer.
	"""
	document = Selector(text=body)
	title = document.css('title::text').extract_first()

	# NOTE(review): the title is interpolated verbatim; a title containing a
	# double quote would produce an invalid XPath expression.
	xpath = '//table[.//table[.//a[contains(text(), "%s")]]]' % title
	table = document.xpath(xpath)[-1]

	# A row containing any of these strings marks the end of a piece.
	detection_strings = ['_____', 'Photographs may appear out of order']

	# Applied in order to the headline row's HTML. Raw strings avoid invalid
	# escape sequences; compiling here keeps the work out of the row loop.
	sanitize_regex = [
		# Merge adjacent <strong> runs so the number and title are contiguous.
		(re.compile(r'</strong>\s*<strong>'), ''),
		# Drop the leading "N. " numbering. The dot is escaped so that only a
		# literal period qualifies (unescaped, ">10 things " would match too).
		(re.compile(r'>\d{1,2}\s*\. '), '>'),
		# Remove <strong> elements left empty by the previous steps.
		(re.compile(r'<strong>\s*</strong>'), ''),
	]
	title_attr_regex = re.compile('title="(.*?)"')

	briefing = create_briefing(table)

	pieces = []
	piece = initiate_piece()
	for row in table.css('tr'):
		image_tag = row.css('td img::attr(src)').extract_first()
		if image_tag:
			# Point at the largest rendition of the image.
			piece['image'] = image_tag.replace('-articleLarge.jpg', '-superJumbo.jpg')

			caption = row.css('td span::text').extract_first()
			# NOTE(review): on Python 3 this stores bytes when a caption exists
			# but str ('') when it does not - confirm what consumers expect.
			piece['imageCaption'] = caption.encode('utf-8').strip() if caption else ''

			continue

		row_texts = row.css('em::text, td::text').extract()
		if any(marker in text.strip() for text in row_texts for marker in detection_strings):
			if piece.get('title') and piece.get('image'):
				piece['pieceTextContent'] = piece['pieceTextContent'].strip('\n')
				pieces.append(piece)

			piece = initiate_piece()
			continue

		headline = row.css('strong::text').extract()
		if piece.get('image') or headline:
			inner_html = row.css('td').extract_first()
			inner_html = clean_tags(['td'], inner_html)

			# Strip title="..." attributes before converting to Markdown.
			inner_html = title_attr_regex.sub('', inner_html)

			if not piece.get('number'):
				# First text row of the piece: "<number>. <title>".
				headline = ' '.join(headline)
				piece['number'], piece['title'] = [x.strip() for x in headline.split('.', 1)]

				for pattern, replacement in sanitize_regex:
					inner_html = pattern.sub(replacement, inner_html)

				# Joining the <strong> fragments can leave spaces inside the number.
				piece['number'] = int(piece['number'].replace(' ', ''))
				if piece['title'] == '':
					piece['title'] = str(piece['number'])

			content = '\n\n' + html2text.handle(inner_html).strip()
			piece['pieceTextContent'] += remove_link_params(content, row)

	return briefing, pieces
Beispiel #2
0
    def parse_link(self, url, file, tags_to_search=['p']):
        """Fetch ``url`` and write the cleaned text of the selected tags to ``file``.

        Args:
            url: Address handed to ``self.get_html``.
            file: Object with a ``write`` method; receives a ``===`` separator
                line followed by the cleaned text of each matched tag, joined
                by newlines.
            tags_to_search: Tag names passed to BeautifulSoup's ``find_all``.
                The default list is only read here, never mutated.

        Returns:
            None. Nothing is written when ``self.get_html`` returns ``None``.
        """
        response = self.get_html(url)
        if response is None:
            return

        soup = BeautifulSoup(response, 'lxml')
        text_bucket = []
        # find_all is the supported spelling; findAll is a deprecated alias.
        for tag in soup.find_all(tags_to_search):
            text = html2text.handle(tag.text)
            # Drop each URL through the end of its line, plus trailing line breaks.
            text = re.sub(r'https?:\/\/.*[\r\n]*',
                          '',
                          text,
                          flags=re.MULTILINE)
            # Replace every character outside the whitelist with a space.
            text = re.sub(r"[^a-zA-Z0-9.!?,;:@()'’]", ' ', text)
            text_bucket.append(text.strip())

        file.write("\n===\n")
        file.write("\n".join(text_bucket))
Beispiel #3
0
 def markdown(self):
     """Return this element's HTML converted to Markdown by html2text."""
     source_html = self.html
     return html2text.handle(source_html)