Example 1
    def __init__(self,
                 query,
                 random_agent=False,
                 debug=False,
                 lang="en",
                 tld="com",
                 re_search_strings=None,
                 timeout=5):
        self.query = query
        self.debug = debug
        self.browser = Browser(debug=debug, timeout=timeout)
        self.results_info = None
        self.eor = False  # end of results
        self._page = 0
        self._first_indexed_in_previous = None
        self._filetype = None
        self._last_search_url = None
        self._results_per_page = 10
        self._last_from = 0
        self._lang = lang
        self._tld = tld

        if re_search_strings:
            self._re_search_strings = re_search_strings
        elif lang == "de":
            self._re_search_strings = ("Ergebnisse", "von", "ungefähr")
        elif lang == "es":
            self._re_search_strings = ("Resultados", "de", "aproximadamente")
        # add more localised versions here
        else:
            self._re_search_strings = ("Results", "of", "about")

        if random_agent:
            self.browser.set_random_user_agent()
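This constructor matches xgoogle's GoogleSearch class, which Example 3 below drives. A minimal usage sketch follows, assuming xgoogle's search module and its SearchError exception; everything except the constructor parameters and res.url (used the same way in Example 3) is an assumption, not something shown in this excerpt:

# Hedged usage sketch (not part of the original listing): assumes the class
# above is xgoogle.search.GoogleSearch and that SearchError is its error type.
from xgoogle.search import GoogleSearch, SearchError

try:
    gs = GoogleSearch("kinopoisk.ru 2009 District 9", random_agent=True, lang="en", tld="com")
    gs.results_per_page = 10
    for res in gs.get_results():
        print res.url.encode('utf8')  # res.url is accessed the same way in Example 3
except SearchError, e:
    print "Search failed: %s" % e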
Example 2
File: kq.py Project: caozhzh/rss
#!/usr/bin/python
#-*- coding: utf-8 -*-
import os
os.chdir('/home/caozhzh/work/rss')

import re
import urllib
from xgoogle.BeautifulSoup import BeautifulSoup
from xgoogle.browser import Browser, BrowserError
from xgoogle.GeneralFetch import GeneralFetch

url = "http://blog.sina.com.cn/u/1696709200"
b = Browser()
page = b.get_page(url)
page = page.replace('<!--[if lte IE 6]>', '')
page = page.replace('<![endif]-->', '')
#print page

be = BeautifulSoup(page)
div = be.find('div', {'class': 'diywidget'})
txt = ''.join(div.findAll(text=True))
#print type(txt)

import feedparser
origin_feed = feedparser.parse('http://blog.sina.com.cn/rss/1696709200.xml')

from feedformatter import Feed
import time
import datetime
import uuid
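A short hedged sketch of what can be done next with origin_feed, using only standard feedparser fields (entries, title, link); the feed-rebuilding logic of kq.py itself is not part of this excerpt:

# Sketch only: standard feedparser fields, not code from kq.py.
for entry in origin_feed.entries:
    title = entry.get('title', u'')
    link = entry.get('link', u'')
    print ("%s -> %s" % (title, link)).encode('utf8')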
Example 3
# Imports assumed for this snippet (same module paths as Example 2 uses);
# strip() is taken to be a project helper that returns a tag's text, trimmed.
from xgoogle.search import GoogleSearch
from xgoogle.browser import Browser
from xgoogle.BeautifulSoup import BeautifulStoneSoup


def update_from_web(model, film, year):
    search = "kinopoisk.ru " + year + " " + film
    print "Search: %s" % search
    browser = Browser(debug=True)
    gs = GoogleSearch(search)
    gs.results_per_page = 1
    results = gs.get_results()
    try:
        for res in results:
            pageurl = res.url.encode('utf8')
            page = browser.get_page(pageurl)
            soup = BeautifulStoneSoup(
                page[page.find("<html"):],
                convertEntities=BeautifulStoneSoup.HTML_ENTITIES,
                fromEncoding="windows-1251")
            print "URL: %s" % pageurl
            rating = soup.find('a', attrs={'class': 'continue'})
            if rating:
                r = strip(rating).split(' ')
                try:
                    model.rating = float(r[1])
                    print "Rating: %s" % r[1]
                except Exception, ex:
                    model.rating = 0.0
                    print "Can't parse rating"

            title = soup.find('h1', 'moviename-big')
            if title:
                print "Title: %s" % strip(title)
                model.title = strip(title)

            info = soup.find('span', '_reachbanner_')
            if info:
                print "Info: %s" % strip(info)
                model.description = strip(info)

            img = soup.find('img', attrs={"width": "120"})
            if img:
                print "Image: %s" % img['src']
                model.image = "http://www.kinopoisk.ru%s" % img['src']


            # The page embeds a JS call like:
            #   getTrailer("t26538","397494/kinopoisk.ru-District-9-36971.mp4","397494/1_36971.jpg","480","270","tr","","");
            # Extract its arguments to recover the trailer URL and thumbnail.
            import re
            m = re.search(r"getTrailer\((.*)\)", str(soup))
            if m:
                parts = m.group(1).split('"')
                url = "http://tr.kinopoisk.ru/%s" % parts[3]
                model.trailer = url
                image = "http://tr.kinopoisk.ru/%s" % parts[5]
                model.trailer_image = image
                print "Trailer: %s" % url
                print "TrailerImage: %s" % image

            break

    except Exception, e:
        print "WARNING: %s" % e