Ejemplo n.º 1
0
 def parse_scrape_response(self, response_text):
     """Extract video metadata from the HTML of a scraped page.

     The markup is parsed with a ``SoupStrainer`` built from
     ``_strain_filter`` so that only the tags of interest are visited.

     :param response_text: raw HTML of the page.
     :returns: a dict containing whichever of these keys could be found:
         ``thumbnail_url``, ``flash_enclosure_url``, ``embed_code``,
         ``link``, ``title``, ``description``, ``user``, ``user_url`` and
         ``publish_date`` (a ``datetime.datetime``).
     :raises ValueError: if the date text does not match ``%m.%d.%y``.
     """
     strainer = SoupStrainer(_strain_filter)
     soup = BeautifulSoup(response_text, parseOnlyThese=strainer)
     data = {}
     for tag in soup:
         if tag.name == "link":
             if tag["rel"] == "image_src":
                 data["thumbnail_url"] = unicode(tag["href"])
             elif tag["rel"] == "video_src":
                 src = unicode(tag["href"])
                 data["flash_enclosure_url"] = src
                 # partition() tolerates a URL with no query string, where
                 # split("?", 1) would fail to unpack.
                 flash_url, _, query = src.partition("?")
                 flash_vars = urlparse.parse_qs(query)
                 flash_vars["cliptype"] = "full"
                 # parse_qs() maps every key to a *list* of values, so
                 # doseq=True is required; without it each list would be
                 # encoded as its repr ("%5B%27...%27%5D").
                 flash_vars = urllib.urlencode(flash_vars, doseq=True)
                 data["embed_code"] = make_embed_code(flash_url, flash_vars)
             elif tag["rel"] == "canonical":
                 # The canonical href is joined onto the site root.
                 data["link"] = u"http://fora.tv%s" % unicode(tag["href"])
         elif tag.name == "span" and tag["id"] == "program_title_text":
             data["title"] = unicode(tag.string)
         elif tag.name == "dd" and tag["class"] == "description":
             # Keep the inner markup: join every child node as text.
             data["description"] = "".join(unicode(t) for t in tag).strip()
         elif tag.name == "a" and tag["class"] == "partner_header":
             data["user"] = unicode(tag.string)
             data["user_url"] = unicode(tag["href"])
         elif tag.name == "div" and tag["class"] == "information_left":
             dds = tag.findAll("dd")
             # NOTE(review): assumes the third <dd> holds the date as
             # MM.DD.YY -- confirm against the page layout.
             date = unicode(dds[2].string)
             data["publish_date"] = datetime.datetime.strptime(
                 date, "%m.%d.%y")
     return data
Ejemplo n.º 2
0
    def parse_feed_entry(self, entry):
        """Flatten one parsed feed entry into a dictionary of video fields.

        :param entry: a mapping-like feed entry that also exposes ``links``
            as an attribute (presumably a feedparser entry -- confirm
            against the caller).
        :returns: a dict with the keys ``link``, ``title``,
            ``description``, ``thumbnail_url``, ``file_url``,
            ``file_url_mimetype``, ``file_url_length``,
            ``publish_datetime``, ``guid``, ``embed_code``, ``tags`` and
            ``license``.  Fields that cannot be determined are ``None``,
            except ``description`` which is always a string.
        :raises KeyError: if the entry has no ``title``.
        """
        enclosure = get_first_accepted_enclosure(entry)

        # Prefer the publication time; fall back to the last update time.
        if "published_parsed" in entry:
            best_date = struct_time_to_datetime(entry["published_parsed"])
        elif "updated_parsed" in entry:
            best_date = struct_time_to_datetime(entry["updated_parsed"])
        else:
            best_date = None

        link = entry.get("link")
        if "links" in entry:
            for possible_link in entry.links:
                if possible_link.get("rel") == "via":
                    # original URL
                    link = possible_link["href"]
                    break

        if "content" in entry and entry["content"] and entry["content"][0]["value"]:  # Atom
            description = entry["content"][0]["value"]
        else:
            # "or" also covers a summary that is present but None, so the
            # description is always a string.
            description = entry.get("summary") or ""

        embed_code = None
        if "media_player" in entry:
            player = entry["media_player"]
            if player.get("content"):
                embed_code = convert_entities(player["content"])
            elif "url" in player:
                embed_code = make_embed_code(player["url"], "")

        # Named entry_license to avoid shadowing the ``license`` builtin.
        if "media_license" in entry:
            entry_license = entry["media_license"]["href"]
        else:
            entry_license = entry.get("license")

        # Only tags whose scheme is None are kept.
        if "tags" in entry:
            tags = [tag["term"] for tag in entry["tags"] if tag["scheme"] is None]
        else:
            tags = None

        return {
            "link": link,
            "title": convert_entities(entry["title"]),
            "description": description,
            "thumbnail_url": get_entry_thumbnail_url(entry),
            "file_url": enclosure.get("url") if enclosure else None,
            "file_url_mimetype": enclosure.get("type") if enclosure else None,
            "file_url_length": ((enclosure.get("filesize") or enclosure.get("length")) if enclosure else None),
            "publish_datetime": best_date,
            "guid": entry.get("id"),
            "embed_code": embed_code,
            "tags": tags,
            "license": entry_license,
        }
Ejemplo n.º 3
0
    def parse_feed_entry(self, entry):
        """Flatten one parsed feed entry into a dictionary of video fields.

        :param entry: a mapping-like feed entry that also exposes ``links``
            as an attribute (presumably a feedparser entry -- confirm
            against the caller).
        :returns: a dict with the keys ``link``, ``title``,
            ``description``, ``thumbnail_url``, ``file_url``,
            ``file_url_mimetype``, ``file_url_length``,
            ``publish_datetime``, ``guid``, ``embed_code`` and ``tags``.
            Fields that cannot be determined are ``None``, except
            ``description`` which is always a string.
        :raises KeyError: if the entry has no ``title``.
        """
        enclosure = get_first_accepted_enclosure(entry)

        # Prefer the publication time; fall back to the last update time.
        if 'published_parsed' in entry:
            best_date = struct_time_to_datetime(entry['published_parsed'])
        elif 'updated_parsed' in entry:
            best_date = struct_time_to_datetime(entry['updated_parsed'])
        else:
            best_date = None

        link = entry.get('link')
        if 'links' in entry:
            for possible_link in entry.links:
                if possible_link.get('rel') == 'via':
                    # original URL
                    link = possible_link['href']
                    break

        if ('content' in entry and entry['content'] and
                entry['content'][0]['value']):  # Atom
            description = entry['content'][0]['value']
        else:
            # .get() so an entry with neither content nor summary yields
            # an empty description instead of raising KeyError.
            description = entry.get('summary') or ''

        embed_code = None
        if 'media_player' in entry:
            player = entry['media_player']
            if player.get('content'):
                embed_code = convert_entities(player['content'])
            elif 'url' in player:
                embed_code = make_embed_code(player['url'], '')

        # Only tags whose scheme is None are kept.
        if 'tags' in entry:
            tags = [tag['term'] for tag in entry['tags']
                    if tag['scheme'] is None]
        else:
            tags = None

        return {
            'link': link,
            'title': convert_entities(entry['title']),
            'description': description,
            'thumbnail_url': get_entry_thumbnail_url(entry),
            'file_url': enclosure.get('url') if enclosure else None,
            'file_url_mimetype': enclosure.get('type') if enclosure else None,
            'file_url_length': ((enclosure.get('filesize') or
                                 enclosure.get('length'))
                                if enclosure else None),
            'publish_datetime': best_date,
            'guid': entry.get('id'),
            'embed_code': embed_code,
            'tags': tags,
            }