def handle(self, response, log, browser, video_id):
    """Scrape a YouTube watch page: title always, extra metadata when enabled.

    Returns a ScanResult whose attr dict always carries the video_id and,
    when self.extra_attr is set, whatever page metadata could be found.
    """
    attr = {'video_id': video_id}
    title = None
    if response.data_type == 'soup':
        soup = response.data
        with trapped:
            title = strip_site_name(render_node(soup.head.title), response.url)
        if self.extra_attr:
            desc = soup.find('div', id='watch-description-clip')
            # Scrape table: (attr name, parent node, tag, match key, match
            # value, child tag when the result is multi-valued).  YouTube is
            # awfully structured, hence the table-driven approach.
            # NOTE: this is stupidly slow.. disable if doing any volume.
            scrape_table = (
                ('uploader', desc, 'p', 'id', 'watch-uploader-info', None),
                ('summary', desc, 'p', 'id', 'eow-description', None),
                ('category', desc, 'p', 'id', 'eow-category', None),
                ('license', desc, 'p', 'id', 'eow-reuse', None),
                ('views', soup, 'span', 'class', 'watch-view-count', None),
                ('extras', soup, 'ul', 'id', 'watch-description-extra-info', 'li'),
                ('tags', desc, 'ul', 'id', 'eow-tags', 'a'),
            )
            for name, parent, tag, key, val, multi in scrape_table:
                # trapped swallows lookup failures so one missing element
                # doesn't abort the whole scrape.
                with trapped:
                    node = parent.find(tag, **{key: val})
                    if multi:
                        attr[name] = [render_node(s) for s in node(multi)]
                    else:
                        attr[name] = render_node(node)
    return ScanResult(response=response, override_url=None, title=title,
                      content_type=None, content=None, attr=attr)
def handle(self, response, log, browser, video_id):
    """Extract the title (and optional extra metadata) from a YouTube page.

    The attr dict always contains the video_id; metadata fields are added
    best-effort only when self.extra_attr is enabled.
    """
    attr = {'video_id': video_id}
    title = None
    if response.data_type != 'soup':
        # Not parsed HTML: nothing to scrape, return the bare attr dict.
        return ScanResult(response=response, override_url=None, title=title,
                          content_type=None, content=None, attr=attr)
    soup = response.data
    with trapped:
        title = strip_site_name(render_node(soup.head.title), response.url)
    if self.extra_attr:
        desc = soup.find('div', id='watch-description-clip')
        # Each row describes one thing to scrape: the attr name, which node
        # to search under, the tag and attribute to match, and (optionally)
        # a child tag when the value is a list.  YouTube is awfully
        # structured, so the lookup is table-driven.
        # NOTE: this is stupidly slow.. disable if doing any volume.
        for row in (('uploader', desc, 'p', 'id', 'watch-uploader-info', None),
                    ('summary', desc, 'p', 'id', 'eow-description', None),
                    ('category', desc, 'p', 'id', 'eow-category', None),
                    ('license', desc, 'p', 'id', 'eow-reuse', None),
                    ('views', soup, 'span', 'class', 'watch-view-count', None),
                    ('extras', soup, 'ul', 'id', 'watch-description-extra-info', 'li'),
                    ('tags', desc, 'ul', 'id', 'eow-tags', 'a')):
            # Failures for individual rows are swallowed by trapped.
            with trapped:
                name, parent, tag, key, val, multi = row
                found = parent.find(tag, **{key: val})
                attr[name] = ([render_node(s) for s in found(multi)]
                              if multi else render_node(found))
    return ScanResult(response=response, override_url=None, title=title,
                      content_type=None, content=None, attr=attr)
def summarize_soup(self, soup):
    """
    Experimental: guess where the page's main content lives and return a
    snippet of summary text from it.

    In its current form, picking news sites & articles at random, it has
    just slightly better than a 50% chance of working right; the rest of
    the time it finds total garbage.  Better than nothing for a first
    attempt.
    """
    # First, clean up the html a little and then blank out every tag that
    # isn't a <div> or <p>.  The theory: those two mostly define the
    # structure of the document relative to whatever text nodes are left.
    markup = text.decode(browser.prettify_node(soup.body))
    for original, tag_name in tag_re.findall(markup):
        if tag_name not in ('div', 'p'):
            markup = markup.replace(original, u' ')
    # Put it back into soup form and score each remaining node by the text
    # content found two levels down (the children of its children) -- in a
    # well-structured article these are the paragraphs.  This falls apart
    # spectacularly on some sites: menus or sidebars laid out the same way
    # can out-weigh the article under a plain character count.  Correcting
    # for that requires investigation.
    stripped = BeautifulSoup(markup)
    scored = []
    for candidate in stripped.findAll(True):
        total = 0
        for child in candidate:
            for piece in child:
                if isinstance(piece, NavigableString):
                    total += len(piece)
        if total:
            scored.append((total, candidate))
    # The node with the most "paragraph" text is assumed to be the intended
    # content; return a cleaned-up snippet from the front of it.
    if scored:
        article = browser.render_node(max(scored)[1])
        words = article[:self.summary_size].split()
        if len(article) > self.summary_size:
            # Truncated mid-word: swap the (likely partial) last word for
            # the continuation marker.
            words[-1] = self.summary_cont
        return u' '.join(words)
def summarize_soup(self, soup):
    """
    Experimental: try to guess where the main content is and return
    summary text.

    In its current form, picking news sites & articles at random, it seems
    to have just slightly better than a 50% chance of working right.  The
    other time it finds total garbage.  Better than nothing for a first
    attempt.

    Returns a unicode snippet of at most ~self.summary_size characters, or
    None when no text-bearing block was found.
    """
    # First, clean up the html a little bit and then remove every tag that
    # isn't a <div> or <p> tag -- the theory being that those two mostly
    # define the structure of the document, which is what we care about
    # relative to whatever text nodes are left over.
    html = text.decode(browser.prettify_node(soup.body))
    for orig, name in tag_re.findall(html):
        if name not in ('div', 'p'):
            html = html.replace(orig, u' ')
    # Put it back into soup form and score every remaining node by the text
    # content of its children's children -- in an article these would be
    # the paragraphs.  This falls apart spectacularly on some sites: menus
    # or sidebars laid out the same way can overtake the article under a
    # plain character count.  Correcting for that requires investigation.
    soup = BeautifulSoup(html)
    blocks = []
    for node in soup.findAll(True):
        size = 0
        for p in node:
            for el in p:
                if isinstance(el, NavigableString):
                    size += len(el)
        if size:
            blocks.append((size, node))
    # Whatever node holds the most "paragraph" text is assumed to be the
    # intended content; grab a cleaned-up snippet from the front of it.
    if blocks:
        # BUG FIX: compare by size only.  Bare max() on (size, node) tuples
        # falls through to comparing parse-tree nodes when sizes tie, which
        # is an undefined ordering (and a TypeError on Python 3).
        article = browser.render_node(max(blocks, key=lambda b: b[0])[1])
        words = article[:self.summary_size].split()
        # BUG FIX: guard against an empty word list (an all-whitespace
        # prefix) before replacing the last word, which used to IndexError.
        if words and len(article) > self.summary_size:
            words[-1] = self.summary_cont
        return u' '.join(words)
def handle(self, response, log, browser):
    """Resolve an imgur page to its underlying image and scan that instead.

    Raises InvalidContent when the response isn't HTML or no image link
    could be located on the page.
    """
    if response.data_type != 'soup':
        raise InvalidContent(response, 'Not an HTML file')
    soup = response.data
    title = None
    with trapped:
        title = strip_site_name(render_node(soup.head.title), response.url)
    with trapped:
        # The page advertises the raw image via <link rel="image_src">;
        # fetch it and let the base image scanner do the real work.  Any
        # failure here is swallowed by trapped and we fall through to the
        # raise below.
        image_url = soup.head.find('link', rel='image_src')['href']
        image_response = browser.open(image_url, follow_meta_redirect=True)
        scanned = super(IMGurScanner, self).handle(image_response, log, browser)
        # Prefer the page's own title when we managed to extract one.
        return ScanResult(response=scanned.response,
                          override_url=scanned.override_url,
                          title=title if title is not None else scanned.title,
                          content_type=scanned.content_type,
                          content=scanned.content,
                          attr=scanned.attr)
    raise InvalidContent(response, "Couldn't find the image")
def handle(self, response, log, browser):
    """Scan a generic HTML page: extract the <title> and a best-effort
    plain-text summary of the main content.

    Raises InvalidContent when the response isn't parsed HTML or when
    neither a title nor a summary could be extracted.
    """
    if response.data_type != 'soup':
        raise InvalidContent(response, 'Not an HTML file')
    soup = response.data
    title = summary = content_type = None
    with trapped:
        title = strip_site_name(render_node(soup.head.title), response.url)
    with trapped:
        summary = self.summarize_soup(soup)
        content_type = 'text/plain'
    if title is None and summary is None and content_type is None:
        # BUG FIX: InvalidContent takes the response as its first argument
        # (as in the 'Not an HTML file' raise above); it was missing here.
        raise InvalidContent(response, "couldn't get anything useful out of that..")
    return ScanResult(response=response, override_url=None, title=title,
                      content_type=content_type, content=summary, attr=None)
def handle(self, response, log, browser):
    """Generic HTML scan: best-effort <title> plus a plain-text summary.

    Raises InvalidContent for non-HTML responses, or when nothing useful
    (neither title nor summary) could be pulled from the page.
    """
    if response.data_type != 'soup':
        raise InvalidContent(response, 'Not an HTML file')
    soup = response.data
    title = summary = content_type = None
    with trapped:
        title = strip_site_name(render_node(soup.head.title), response.url)
    with trapped:
        summary = self.summarize_soup(soup)
        content_type = 'text/plain'
    if title is None and summary is None and content_type is None:
        # BUG FIX: the response must be passed as InvalidContent's first
        # argument, consistent with every other raise in this module; the
        # original call omitted it.
        raise InvalidContent(response, "couldn't get anything useful out of that..")
    return ScanResult(response=response, override_url=None, title=title,
                      content_type=content_type, content=summary, attr=None)