Example #1
                elif 'video_src' in tag['rel']:
                    data['flash_enclosure_url'] = unicode(tag['href'])
                elif 'canonical' in tag['rel']:
                    data['link'] = u"http://fora.tv{0}".format(
                                                         unicode(tag['href']))
            elif tag.name == 'span' and tag['id'] == 'program_title_text':
                data['title'] = unicode(tag.string)
            elif tag.name == 'dd' and 'description' in tag['class']:
                data['description'] = ''.join((unicode(t) for t in tag)).strip()
            elif tag.name == 'a' and 'partner_header' in tag['class']:
                data['user'] = unicode(tag.string)
                data['user_url'] = unicode(tag['href'])
            elif tag.name == 'div' and 'information_left' in tag['class']:
                dds = tag.find_all('dd')
                date = unicode(dds[2].string)
                date = datetime.datetime.strptime(date, "%m.%d.%y")
                data['publish_date'] = date
        return data


class Suite(BaseSuite):
    """
    Suite for fora.tv. As of 25-03-2012, fora.tv offers no public API; only
    video pages and RSS feeds are available.

    """
    loader_classes = (ScrapeLoader,)


registry.register(Suite)
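
The scraper in this example is a single if/elif dispatch over every tag of interest on the page. As a rough, self-contained illustration of the same pattern (matching the Python 2 idioms used throughout these examples), here is a sketch run against an invented HTML fragment; the markup and field values are hypothetical, not fora.tv's actual page structure:

import datetime
from bs4 import BeautifulSoup

# Hypothetical fragment for illustration only.
HTML = u"""
<link rel="canonical" href="/2012/03/some_program">
<span id="program_title_text">Some Program</span>
<dd class="description">A talk about <em>something</em>.</dd>
<div class="information_left"><dd>x</dd><dd>y</dd><dd>03.25.12</dd></div>
"""

def scrape(html):
    data = {}
    for tag in BeautifulSoup(html, 'html.parser').find_all(True):
        if tag.name == 'link' and 'canonical' in tag.get('rel', []):
            data['link'] = u"http://fora.tv{0}".format(tag['href'])
        elif tag.name == 'span' and tag.get('id') == 'program_title_text':
            data['title'] = unicode(tag.string)
        elif tag.name == 'dd' and 'description' in tag.get('class', []):
            data['description'] = u''.join(unicode(t) for t in tag).strip()
        elif tag.name == 'div' and 'information_left' in tag.get('class', []):
            date = unicode(tag.find_all('dd')[2].string)
            data['publish_date'] = datetime.datetime.strptime(date, "%m.%d.%y")
    return data

print scrape(HTML)

Using tag.get(..., []) instead of tag[...] keeps the dispatch from raising KeyError on tags that lack a rel, id, or class attribute.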
Example #2
        return data

    def get_next_feed_page_url(self, feed, feed_response):
        parsed = urlparse.urlparse(feed_response.href)
        params = urlparse.parse_qs(parsed.query)
        try:
            page = int(params.get('page', ['1'])[0])
        except ValueError:
            page = 1
        params['page'] = unicode(page + 1)
        return "%s?%s" % (urlparse.urlunparse(parsed[:4] + (None, None)),
                          urllib.urlencode(params, True))

    def get_api_url(self, video):
        if '-' not in video.url:
            # http://blip.tv/file/1077145/
            # oh no, an older URL; get the redirected URL
            resp = urllib.urlopen(video.url)
            video.url = resp.geturl()
            resp.close()
        parsed_url = urlparse.urlparse(video.url)
        post_id = parsed_url[2].rsplit('-', 1)[1]
        new_parsed_url = parsed_url[:2] + (
            "/rss/%s" % post_id, None, None, None)
        return urlparse.urlunparse(new_parsed_url)

    def parse_api_response(self, response_text):
        parsed = feedparser.parse(response_text)
        return self.parse_feed_entry(parsed.entries[0])


registry.register(BlipSuite)
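
get_next_feed_page_url simply increments a page query parameter and leaves the rest of the URL intact. Pulled out into a standalone helper (Python 2, matching the urlparse/urllib usage above; the sample URL is made up), the logic looks like this:

import urllib
import urlparse

def next_page_url(url):
    parsed = urlparse.urlparse(url)
    params = urlparse.parse_qs(parsed.query)
    try:
        page = int(params.get('page', ['1'])[0])
    except ValueError:
        page = 1
    params['page'] = str(page + 1)
    # Rebuild the URL without its query string, then append the updated one.
    return "%s?%s" % (urlparse.urlunparse(parsed[:4] + ('', '')),
                      urllib.urlencode(params, True))

print next_page_url("http://blip.tv/sample/rss?page=2")
# http://blip.tv/sample/rss?page=3

The True passed to urlencode (doseq) matters because parse_qs returns every value as a list.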
Example #3
        parsed = urlparse.urlsplit(url)
        if (parsed.scheme in ('http', 'https')
                and parsed.netloc == 'video.google.com'
                and parsed.path == '/videoplay' and 'docid' in parsed.query):
            return {'url': url}
        raise UnhandledVideo(url)

    def get_video_data(self, response):
        soup = BeautifulSoup(response.text).findAll(id=self.id_regex)
        data = {}
        for tag in soup:
            if tag['id'] == 'video-title':
                data['title'] = unicode(tag.string)
            elif tag['id'] == 'video-description':
                data['description'] = ''.join(
                    (unicode(t) for t in tag)).strip()
            elif tag['id'] == 'embed-video-code':
                # This isn't the cleanest way of handling the gt/lt problem,
                # but this is a scrape and liable to break anyway. KISS.
                data['embed_code'] = unicode(tag.string).replace(
                    "&gt;", ">").replace("&lt;", "<")
        return data


class Suite(BaseSuite):
    """Suite for scraping video pages from videos.google.com"""
    loader_classes = (ScrapeLoader, )


registry.register(Suite)
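
The urlsplit guard at the top of this example is a stricter alternative to a regex match: scheme, host, path, and the presence of docid in the query are each checked explicitly. The same check as a standalone predicate (example URLs only):

import urlparse

def is_google_video_url(url):
    parsed = urlparse.urlsplit(url)
    return (parsed.scheme in ('http', 'https')
            and parsed.netloc == 'video.google.com'
            and parsed.path == '/videoplay'
            and 'docid' in parsed.query)

print is_google_video_url('http://video.google.com/videoplay?docid=123')  # True
print is_google_video_url('http://example.com/videoplay?docid=123')       # False

Note that 'docid' in parsed.query is a substring test on the raw query string; parsing the query with urlparse.parse_qs first would be stricter, since the substring test also matches parameter names that merely contain "docid".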
Example #4
            "user_url",
        ]
    )

    def get_api_url(self, video):
        video_id = self.video_regex.match(video.url).group("id")
        if video.api_keys is None or "ustream_key" not in video.api_keys:
            raise ValueError("API key must be set for Ustream API requests.")
        return "http://api.ustream.tv/json/video/%s/getInfo/?key=%s" % (video_id, video.api_keys["ustream_key"])

    def parse_api_response(self, response_text):
        parsed = json.loads(response_text)["results"]
        url = parsed["embedTagSourceUrl"]
        publish_date = datetime.datetime.strptime(parsed["createdAt"], "%Y-%m-%d %H:%M:%S")
        data = {
            "link": parsed["url"],
            "title": parsed["title"],
            "description": parsed["description"],
            "flash_enclosure_url": url,
            "embed_code": "<iframe src='%s' width='320' height='260' />" % url,
            "thumbnail_url": parsed["imageUrl"]["medium"],
            "publish_date": publish_date,
            "tags": [unicode(tag) for tag in parsed["tags"]],
            "user": parsed["user"]["userName"],
            "user_url": parsed["user"]["url"],
        }
        return data


registry.register(UstreamSuite)
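
parse_api_response can be exercised without touching the network by handing it a hand-written payload. The JSON below is fabricated to cover exactly the keys the parser reads (it is not a real Ustream response), and the usage assumes UstreamSuite can be constructed with no arguments:

import json

# Fabricated payload; every value here is invented for illustration.
SAMPLE_RESPONSE = json.dumps({"results": {
    "url": "http://www.ustream.tv/recorded/123",
    "title": "Sample Video",
    "description": "A made-up description.",
    "embedTagSourceUrl": "http://www.ustream.tv/flash/video/123",
    "createdAt": "2012-03-25 12:00:00",
    "imageUrl": {"medium": "http://example.com/thumb.jpg"},
    "tags": ["sample", "test"],
    "user": {"userName": "someone",
             "url": "http://www.ustream.tv/user/someone"},
}})

suite = UstreamSuite()  # assumes a no-argument constructor
data = suite.parse_api_response(SAMPLE_RESPONSE)
print data["title"]         # Sample Video
print data["publish_date"]  # 2012-03-25 12:00:00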
Example #5
    def get_next_page_url_params(self, response):
        start_index = response['feed'].get('opensearch_startindex', None)
        per_page = response['feed'].get('opensearch_itemsperpage', None)
        total_results = response['feed'].get('opensearch_totalresults', None)
        if start_index is None or per_page is None or total_results is None:
            return None
        new_start = int(start_index) + int(per_page)
        if new_start > int(total_results):
            return None
        extra_params = {
            'start-index': new_start,
            'max-results': per_page
        }
        return extra_params

    def get_next_search_page_url(self, search, search_response):
        extra_params = self.get_next_page_url_params(search_response)
        if not extra_params:
            return None
        return self.get_search_url(
            search,
            extra_params=extra_params)

    def get_next_feed_page_url(self, feed, feed_response):
        extra_params = self.get_next_page_url_params(feed_response)
        if not extra_params:
            return None
        return self.get_feed_url(feed.url, extra_params=extra_params)


registry.register(YouTubeSuite)
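
Pagination here reduces to arithmetic over the OpenSearch metadata that feedparser exposes: the next start index is start_index + per_page, and iteration stops once that passes total_results. A standalone sketch with fabricated metadata:

def next_page_params(feed_meta):
    start = feed_meta.get('opensearch_startindex')
    per_page = feed_meta.get('opensearch_itemsperpage')
    total = feed_meta.get('opensearch_totalresults')
    if start is None or per_page is None or total is None:
        return None
    new_start = int(start) + int(per_page)
    if new_start > int(total):
        return None
    return {'start-index': new_start, 'max-results': per_page}

print next_page_params({'opensearch_startindex': '1',
                        'opensearch_itemsperpage': '25',
                        'opensearch_totalresults': '100'})
# {'start-index': 26, 'max-results': '25'} (key order may vary)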
Example #6
        request = client.request(search_url)
        return json.loads(request[1])

    def get_search_total_results(self, search, search_response):
        return int(search_response["videos"]["total"])

    def get_search_results(self, search, search_response):
        return search_response["videos"]["video"]

    def parse_search_result(self, search, result):
        # TODO: results have an embed_privacy key. What is this? Should
        # vidscraper return that information? Doesn't youtube have something
        # similar?
        video_id = result["id"]
        data = {
            "title": result["title"],
            "link": [u["_content"] for u in result["urls"]["url"] if u["type"] == "video"][0],
            "description": result["description"],
            "thumbnail_url": result["thumbnails"]["thumbnail"][1]["_content"],
            "user": result["owner"]["realname"],
            "user_url": result["owner"]["profileurl"],
            "publish_datetime": datetime.strptime(result["upload_date"], "%Y-%m-%d %H:%M:%S"),
            "tags": [t["_content"] for t in result.get("tags", {}).get("tag", [])],
            "flash_enclosure_url": self._flash_enclosure_url_from_id(video_id),
            "embed_code": self._embed_code_from_id(video_id),
        }
        return data


registry.register(VimeoSuite)
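
One detail worth isolating in parse_search_result is the link extraction: the result carries a list of typed URLs, and the comprehension takes the first entry typed "video". With a fabricated result fragment, the idiom and a safer variant look like this:

# Fabricated fragment; a real Vimeo result has many more keys.
result = {"urls": {"url": [
    {"type": "mobile", "_content": "http://vimeo.com/m/123"},
    {"type": "video", "_content": "http://vimeo.com/123"},
]}}

# The idiom used above; raises IndexError if no "video" entry exists.
link = [u["_content"] for u in result["urls"]["url"]
        if u["type"] == "video"][0]

# A variant that degrades to None instead of raising.
link = next((u["_content"] for u in result["urls"]["url"]
             if u["type"] == "video"), None)
print link  # http://vimeo.com/123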
Example #7
        for tag in soup:
            if tag.name == "link":
                if tag["rel"] == "image_src":
                    data["thumbnail_url"] = unicode(tag["href"])
                elif tag["rel"] == "video_src":
                    src = unicode(tag["href"])
                    data["flash_enclosure_url"] = src
                    flash_url, flash_vars = src.split("?", 1)
                    flash_vars = urlparse.parse_qs(flash_vars)
                    flash_vars["cliptype"] = "full"
                    flash_vars = urllib.urlencode(flash_vars, True)
                    data["embed_code"] = make_embed_code(flash_url, flash_vars)
                elif tag["rel"] == "canonical":
                    data["link"] = u"http://fora.tv%s" % unicode(tag["href"])
            elif tag.name == "span" and tag["id"] == "program_title_text":
                data["title"] = unicode(tag.string)
            elif tag.name == "dd" and tag["class"] == "description":
                data["description"] = "".join((unicode(t) for t in tag)).strip()
            elif tag.name == "a" and tag["class"] == "partner_header":
                data["user"] = unicode(tag.string)
                data["user_url"] = unicode(tag["href"])
            elif tag.name == "div" and tag["class"] == "information_left":
                dds = tag.findAll("dd")
                date = unicode(dds[2].string)
                date = datetime.datetime.strptime(date, "%m.%d.%y")
                data["publish_date"] = date
        return data


registry.register(ForaSuite)
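
The video_src branch in this version does more than record the URL: it splits off the flashvars, forces cliptype=full, and re-encodes them for the embed code. That round trip can be sketched on its own; the URL is invented, and make_embed_code (defined elsewhere in the module) is stubbed here:

import urllib
import urlparse

def make_embed_code(flash_url, flash_vars):
    # Stand-in for the module's real helper; the actual markup may differ.
    return u'<embed src="%s" flashvars="%s"></embed>' % (flash_url, flash_vars)

src = "http://fora.tv/embedded_player?clipid=123&cliptype=clip"
flash_url, flash_vars = src.split("?", 1)
flash_vars = urlparse.parse_qs(flash_vars)       # values come back as lists
flash_vars["cliptype"] = "full"
flash_vars = urllib.urlencode(flash_vars, True)  # doseq=True flattens the lists
print make_embed_code(flash_url, flash_vars)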
Example #8

ID_REGEX = re.compile(r"video-title|video-description|embed-video-code")


class GoogleSuite(BaseSuite):
    """Suite for scraping video pages from videos.google.com"""

    video_regex = r"^https?://video.google.com/videoplay"
    scrape_fields = set(["title", "description", "embed_code"])

    def get_scrape_url(self, video):
        return video.url

    def parse_scrape_response(self, response_text):
        soup = BeautifulSoup(response_text).findAll(attrs={"id": ID_REGEX})
        data = {}
        for tag in soup:
            if tag["id"] == "video-title":
                data["title"] = unicode(tag.string)
            elif tag["id"] == "video-description":
                data["description"] = "".join((unicode(t) for t in tag)).strip()
            elif tag["id"] == "embed-video-code":
                # This isn't the cleanest way of handling the gt/lt problem,
                # but this is a scrape and liable to break anyway. KISS.
                data["embed_code"] = unicode(tag.string).replace("&gt;", ">").replace("&lt;", "<")
        return data


registry.register(GoogleSuite)
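
The entity handling in the embed-video-code branch only reverses &gt; and &lt;, which the comment itself flags as a shortcut. A quick demonstration on a fabricated escaped snippet, alongside the stdlib helper that also reverses &amp;:

from xml.sax.saxutils import unescape

escaped = u"&lt;embed src='http://example.com/player.swf'&gt;&lt;/embed&gt;"

# The scraper's approach: two targeted replacements.
print escaped.replace("&gt;", ">").replace("&lt;", "<")

# Stdlib alternative; handles &amp;, &lt;, and &gt; by default.
print unescape(escaped)

Both lines print the same unescaped embed markup.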