Example #1
0
    def handle(self, response, log, browser, video_id):
        """Build a ScanResult for a video page.

        Records video_id in attr, best-effort extracts the page title,
        and (when self.extra_attr is set) scrapes additional metadata
        fields from the parsed page. Each scrape step runs under the
        `trapped` context manager so individual failures are skipped.
        """
        attr = {'video_id': video_id}
        title = None

        if response.data_type == 'soup':
            soup = response.data
            with trapped:
                title = strip_site_name(render_node(soup.head.title), response.url)

            if self.extra_attr:
                desc = soup.find('div', id='watch-description-clip')

                # Scrape table: (attr name, search root, tag, attr key,
                # attr value, child tag to collect — None means single node).
                # NOTE: this is stupidly slow.. disable if doing any volume.
                specs = (
                    ('uploader', desc, 'p',    'id',    'watch-uploader-info',           None),
                    ('summary',  desc, 'p',    'id',    'eow-description',               None),
                    ('category', desc, 'p',    'id',    'eow-category',                  None),
                    ('license',  desc, 'p',    'id',    'eow-reuse',                     None),
                    ('views',    soup, 'span', 'class', 'watch-view-count',              None),
                    ('extras',   soup, 'ul',   'id',    'watch-description-extra-info', 'li' ),
                    ('tags',     desc, 'ul',   'id',    'eow-tags',                      'a' ),
                )
                for name, parent, tag, key, val, multi in specs:
                    with trapped:
                        node = parent.find(tag, **{key: val})
                        if multi:
                            attr[name] = [render_node(child) for child in node(multi)]
                        else:
                            attr[name] = render_node(node)

        return ScanResult(response=response,
                          override_url=None,
                          title=title,
                          content_type=None,
                          content=None,
                          attr=attr)
Example #2
0
    def handle(self, response, log, browser):
        """Resolve an imgur-style page to its underlying image.

        Best-effort: grab the page title, then follow the head's
        image_src link and delegate to the parent scanner. Failures
        inside `trapped` fall through to the final InvalidContent.
        """
        if response.data_type != 'soup':
            raise InvalidContent(response, 'Not an HTML file')
        soup = response.data

        title = None
        with trapped:
            title = strip_site_name(render_node(soup.head.title), response.url)

        with trapped:
            image_url = soup.head.find('link', rel='image_src')['href']
            response = browser.open(image_url, follow_meta_redirect=True)
            result = super(IMGurScanner, self).handle(response, log, browser)
            # Prefer the page's own title when we managed to extract one.
            final_title = title if title is not None else result.title
            return ScanResult(response=result.response,
                              override_url=result.override_url,
                              title=final_title,
                              content_type=result.content_type,
                              content=result.content,
                              attr=result.attr)

        raise InvalidContent(response, "Couldn't find the image")
Example #3
0
File: html.py Project: Havvy/madcow
    def handle(self, response, log, browser):
        """Extract a title and plain-text summary from an HTML page.

        Both extractions are best-effort (wrapped in `trapped`); raises
        InvalidContent when neither yields anything usable.
        """
        if response.data_type != 'soup':
            raise InvalidContent(response, 'Not an HTML file')
        soup = response.data
        title = summary = content_type = None

        with trapped:
            title = strip_site_name(render_node(soup.head.title), response.url)

        with trapped:
            summary = self.summarize_soup(soup)
            content_type = 'text/plain'

        if title is None and summary is None and content_type is None:
            # BUG FIX: InvalidContent takes (response, message) — every other
            # call site passes the response first; this one omitted it.
            raise InvalidContent(response, "couldn't get anything useful out of that..")

        return ScanResult(response=response,
                          override_url=None,
                          title=title,
                          content_type=content_type,
                          content=summary,
                          attr=None)
Example #4
0
    def handle(self, response, log, browser):
        """Extract a title and plain-text summary from an HTML page.

        Both extractions are best-effort (wrapped in `trapped`); raises
        InvalidContent when neither yields anything usable.
        """
        if response.data_type != 'soup':
            raise InvalidContent(response, 'Not an HTML file')
        soup = response.data
        title = summary = content_type = None

        with trapped:
            title = strip_site_name(render_node(soup.head.title), response.url)

        with trapped:
            summary = self.summarize_soup(soup)
            content_type = 'text/plain'

        if title is None and summary is None and content_type is None:
            # BUG FIX: InvalidContent takes (response, message) — every other
            # call site passes the response first; this one omitted it.
            raise InvalidContent(response, "couldn't get anything useful out of that..")

        return ScanResult(response=response,
                          override_url=None,
                          title=title,
                          content_type=content_type,
                          content=summary,
                          attr=None)
Example #5
0
    def handle(self, response, log, browser, video_id):
        """Build a ScanResult for a video page, tagging it with video_id.

        Title extraction and each metadata scrape run under `trapped`,
        so any single failure is silently skipped and the rest proceed.
        """
        attr = {'video_id': video_id}
        title = None

        if response.data_type == 'soup':
            soup = response.data
            with trapped:
                title = strip_site_name(render_node(soup.head.title),
                                        response.url)

            if self.extra_attr:
                desc = soup.find('div', id='watch-description-clip')

                # Each entry: attribute name, node to search under, tag name,
                # HTML attribute key/value to match, and an optional child tag
                # whose occurrences should be collected as a list.
                # NOTE: this is stupidly slow.. disable if doing any volume.
                scrape_plan = [
                    ('uploader', desc, 'p', 'id', 'watch-uploader-info', None),
                    ('summary', desc, 'p', 'id', 'eow-description', None),
                    ('category', desc, 'p', 'id', 'eow-category', None),
                    ('license', desc, 'p', 'id', 'eow-reuse', None),
                    ('views', soup, 'span', 'class', 'watch-view-count', None),
                    ('extras', soup, 'ul', 'id',
                     'watch-description-extra-info', 'li'),
                    ('tags', desc, 'ul', 'id', 'eow-tags', 'a'),
                ]
                for name, parent, tag, key, val, multi in scrape_plan:
                    with trapped:
                        node = parent.find(tag, **{key: val})
                        if multi:
                            attr[name] = [render_node(sub)
                                          for sub in node(multi)]
                        else:
                            attr[name] = render_node(node)

        return ScanResult(response=response,
                          override_url=None,
                          title=title,
                          content_type=None,
                          content=None,
                          attr=attr)