def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None, timeout=5): self.query = query self.debug = debug self.browser = Browser(debug=debug, timeout=timeout) self.results_info = None self.eor = False # end of results self._page = 0 self._first_indexed_in_previous = None self._filetype = None self._last_search_url = None self._results_per_page = 10 self._last_from = 0 self._lang = lang self._tld = tld if re_search_strings: self._re_search_strings = re_search_strings elif lang == "de": self._re_search_strings = ("Ergebnisse", "von", "ungefähr") elif lang == "es": self._re_search_strings = ("Resultados", "de", "aproximadamente") # add more localised versions here else: self._re_search_strings = ("Results", "of", "about") if random_agent: self.browser.set_random_user_agent()
#!/usr/bin/python #-*- coding: utf-8 -*- import os os.chdir('/home/caozhzh/work/rss') import re import urllib from xgoogle.BeautifulSoup import BeautifulSoup from xgoogle.browser import Browser, BrowserError from xgoogle.GeneralFetch import GeneralFetch url = "http://blog.sina.com.cn/u/1696709200" b = Browser() page = b.get_page(url) page = page.replace('<!–[if lte IE 6]>', '') page = page.replace('<![endif]–>', '') #print page be = BeautifulSoup(page) div = be.find('div', {'class': 'diywidget'}) txt = ''.join(div.findAll(text=True)) #print type(txt) import feedparser origin_feed = feedparser.parse('http://blog.sina.com.cn/rss/1696709200.xml') from feedformatter import Feed import time import datetime import uuid
def update_from_web(model, film, year):
    """Google for a kinopoisk.ru page of *film* (*year*) and copy
    rating, title, description, image and trailer links onto *model*.

    Only the first Google result is examined (results_per_page = 1,
    loop breaks after one iteration). Fields are only set when the
    corresponding HTML element is found; any network/parse error is
    caught and reported as a WARNING, leaving *model* partially
    updated. `strip` is a helper defined elsewhere in this file —
    presumably it flattens an element to plain text; verify.
    """
    search = "kinopoisk.ru " + year + " " + film
    print "Search: %s" % search
    browser = Browser(debug=True)
    gs = GoogleSearch(search)
    gs.results_per_page = 1
    results = gs.get_results()
    try:
        for res in results:
            pageurl = res.url.encode('utf8')
            page = browser.get_page(pageurl)
            # kinopoisk.ru serves windows-1251; skip any junk before
            # the <html> tag so the parser starts clean.
            soup = BeautifulStoneSoup(
                page[page.find("<html"):],
                convertEntities=BeautifulStoneSoup.HTML_ENTITIES,
                fromEncoding="windows-1251")
            print "URL: %s" % pageurl
            # Rating lives in an <a class="continue"> element; the
            # number is the second whitespace-separated token.
            rating = soup.find('a', attrs={'class': 'continue'})
            if rating:
                r = strip(rating).split(' ')
                try:
                    model.rating = float(r[1])
                    print "Rating: %s" % r[1]
                except Exception, ex:
                    # Missing/unparsable number — default to 0.0.
                    model.rating = 0.0
                    print "Can't parse rating"
            title = soup.find('h1', 'moviename-big')
            if title:
                print "Title: %s" % strip(title)
                model.title = strip(title)
            # Film synopsis.
            info = soup.find('span', '_reachbanner_')
            if info:
                print "Info: %s" % strip(info)
                model.description = strip(info)
            # Poster thumbnail — identified only by its fixed width.
            img = soup.find('img', attrs={"width": "120"})
            if img:
                print "Image: %s" % img['src']
                model.image = "http://www.kinopoisk.ru%s" % img['src']
            # Trailer URLs are embedded in a JS call, e.g.:
            #getTrailer("t26538","397494/kinopoisk.ru-District-9-36971.mp4","397494/1_36971.jpg","480","270","tr","","");
            import re
            m = re.search("getTrailer\((.*)\)", str(soup))
            if not m:
                pass
            else:
                # Splitting on '"' puts the video path at index 3 and
                # the preview image path at index 5.
                parts = m.group(1).split('"')
                url = "http://tr.kinopoisk.ru/%s" % parts[3]
                model.trailer = url
                image = "http://tr.kinopoisk.ru/%s" % parts[5]
                model.trailer_image = image
                print "Trailer: %s" % url
                print "TrailerImage: %s" % image
            # Only the first search result is used.
            break
    except Exception, e:
        # Best-effort scrape: report and continue with whatever was set.
        print "WARNING: %s" % e