def update_from_web(model, film, year):
    search = "kinopoisk.ru " + year + " " + film
    print("Search: %s" % search)
    browser = Browser(debug=True)
    gs = GoogleSearch(search)
    gs.results_per_page = 1
    results = gs.get_results()
    try:
        for res in results:
            # res.url is already text; the old .encode('utf8') call predates Python 3
            pageurl = res.url
            page = browser.get_page(pageurl)
            soup = BeautifulStoneSoup(
                page[page.find("<html"):],
                convertEntities=BeautifulStoneSoup.HTML_ENTITIES,
                fromEncoding="windows-1251")
            print("URL: %s" % pageurl)
            rating = soup.find('a', attrs={'class': 'continue'})
            if rating:
                # NB: strip() here is a helper from the surrounding module, not str.strip
                r = strip(rating).split(' ')
                try:
                    model.rating = float(r[1])
                    print("Rating: %s" % r[1])
                except Exception as ex:
                    model.rating = 0.0
                    print("Can't parse rating")
            title = soup.find('h1', 'moviename-big')
            if title:
                print("Title: %s" % strip(title))
                model.title = strip(title)
            info = soup.find('span', '_reachbanner_')
            if info:
                print("Info: %s" % strip(info))
                model.description = strip(info)
            img = soup.find('img', attrs={"width": "120"})
            if img:
                print("Image: %s" % img['src'])
                model.image = "http://www.kinopoisk.ru%s" % img['src']
            # getTrailer("t26538","397494/kinopoisk.ru-District-9-36971.mp4","397494/1_36971.jpg","480","270","tr","","");
            import re
            m = re.search(r"getTrailer\((.*)\)", str(soup))
            if m:
                parts = m.group(1).split('"')
                url = "http://tr.kinopoisk.ru/%s" % parts[3]
                model.trailer = url
                image = "http://tr.kinopoisk.ru/%s" % parts[5]
                model.trailer_image = image
                print("Trailer: %s" % url)
                print("TrailerImage: %s" % image)
            break
    except Exception as e:
        print("WARNING: %s" % e)
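# Hypothetical smoke test for update_from_web: `model` only needs writable
# attributes, so a bare stub stands in for the real ORM object defined
# elsewhere. The film/year values are examples taken from the getTrailer
# comment above.
class FilmStub(object):
    pass

stub = FilmStub()
update_from_web(stub, "District 9", "2009")
print(getattr(stub, "rating", None), getattr(stub, "title", None))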
class GoogleImageSearch(object):
    SEARCH_URL_0 = "http://www.google.%(tld)s/search?tbm=isch&hl=%(lang)s&q=%(query)s"
    NEXT_PAGE_0 = "http://www.google.%(tld)s/search?tbm=isch&hl=%(lang)s&q=%(query)s&start=%(start)d"
    SEARCH_URL_1 = "http://www.google.%(tld)s/search?tbm=isch&hl=%(lang)s&q=%(query)s&num=%(num)d"
    NEXT_PAGE_1 = "http://www.google.%(tld)s/search?tbm=isch&hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d"

    def __init__(self, query, random_agent=False, debug=False, lang="en",
                 tld="com", re_search_strings=None, timeout=5):
        self.query = query
        self.debug = debug
        self.browser = Browser(debug=debug, timeout=timeout)
        self.results_info = None
        self.eor = False  # end of results
        self._page = 0
        self._first_indexed_in_previous = None
        self._filetype = None
        self._last_search_url = None
        self._results_per_page = 10
        self._last_from = 0
        self._lang = lang
        self._tld = tld
        if re_search_strings:
            self._re_search_strings = re_search_strings
        elif lang == "de":
            self._re_search_strings = ("Ergebnisse", "von", "ungefähr")
        elif lang == "es":
            self._re_search_strings = ("Resultados", "de", "aproximadamente")
        # add more localised versions here
        else:
            self._re_search_strings = ("Results", "of", "about")
        if random_agent:
            self.browser.set_random_user_agent()

    @property
    def num_results(self):
        if not self.results_info:
            page = self._get_results_page()
            self.results_info = self._extract_info(page)
            if self.results_info['total'] == 0:
                self.eor = True
        return self.results_info['total']

    @property
    def last_search_url(self):
        return self._last_search_url

    def _get_page(self):
        return self._page

    def _set_page(self, page):
        self._page = page

    page = property(_get_page, _set_page)

    def _get_first_indexed_in_previous(self):
        return self._first_indexed_in_previous

    def _set_first_indexed_in_previous(self, interval):
        if interval == "day":
            self._first_indexed_in_previous = 'd'
        elif interval == "week":
            self._first_indexed_in_previous = 'w'
        elif interval == "month":
            self._first_indexed_in_previous = 'm'
        elif interval == "year":
            self._first_indexed_in_previous = 'y'
        else:
            # a floating point value is a number of months
            try:
                num = float(interval)
            except ValueError:
                raise SearchError("Wrong parameter to first_indexed_in_previous: %s" % (str(interval)))
            self._first_indexed_in_previous = 'm' + str(interval)

    first_indexed_in_previous = property(
        _get_first_indexed_in_previous, _set_first_indexed_in_previous,
        doc="possible values: day, week, month, year, or a float value of months")

    def _get_filetype(self):
        return self._filetype

    def _set_filetype(self, filetype):
        self._filetype = filetype

    filetype = property(_get_filetype, _set_filetype, doc="file extension to search for")

    def _get_results_per_page(self):
        return self._results_per_page

    def _set_results_per_page(self, rpp):
        self._results_per_page = rpp

    results_per_page = property(_get_results_per_page, _set_results_per_page)

    def get_results(self):
        """Gets a page of results."""
        if self.eor:
            return []
        MAX_VALUE = 1000000
        page = self._get_results_page()
        results = self._extract_results(page)
        search_info = {'from': self.results_per_page * self._page,
                       'to': self.results_per_page * self._page + len(results),
                       'total': MAX_VALUE}
        if not self.results_info:
            self.results_info = search_info
            if self.num_results == 0:
                self.eor = True
                return []
        if not results:
            self.eor = True
            return []
        if self._page > 0 and search_info['from'] == self._last_from:
            self.eor = True
            return []
        if search_info['to'] == search_info['total']:
            self.eor = True
        self._page += 1
        self._last_from = search_info['from']
        return results

    def _maybe_raise(self, cls, *arg):
        if self.debug:
            raise cls(*arg)

    def _get_results_page(self):
        if self._page == 0:
            if self._results_per_page == 10:
                url = GoogleImageSearch.SEARCH_URL_0
            else:
                url = GoogleImageSearch.SEARCH_URL_1
        else:
            if self._results_per_page == 10:
                url = GoogleImageSearch.NEXT_PAGE_0
            else:
                url = GoogleImageSearch.NEXT_PAGE_1
        safe_url = [url % {'query': urllib.parse.quote_plus(self.query),
                           'start': self._page * self._results_per_page,
                           'num': self._results_per_page,
                           'tld': self._tld,
                           'lang': self._lang}]
        # possibly extend url with optional properties
        if self._first_indexed_in_previous:
            safe_url.extend(["&as_qdr=", self._first_indexed_in_previous])
        if self._filetype:
            safe_url.extend(["&as_filetype=", self._filetype])
        safe_url = "".join(safe_url)
        self._last_search_url = safe_url
        try:
            page = self.browser.get_page(safe_url)
        except BrowserError as e:
            raise SearchError("Failed getting %s: %s" % (e.url, e.error))
        return BeautifulSoup(page)

    def _extract_info(self, soup):
        empty_info = {'from': 0, 'to': 0, 'total': 0}
        div_ssb = soup.find('div', id='ssb')
        if not div_ssb:
            self._maybe_raise(ParseError, "Div with number of results was not found on Google search page", soup)
            return empty_info
        p = div_ssb.find('p')
        if not p:
            self._maybe_raise(ParseError, """<p> tag within <div id="ssb"> was not found on Google search page""", soup)
            return empty_info
        txt = ''.join(p.findAll(text=True))
        txt = txt.replace(',', '')
        matches = re.search(r'%s (\d+) - (\d+) %s (?:%s )?(\d+)' % self._re_search_strings, txt, re.U)
        if not matches:
            return empty_info
        return {'from': int(matches.group(1)),
                'to': int(matches.group(2)),
                'total': int(matches.group(3))}

    def _extract_results(self, soup):
        # Should extract <a href="/url?q= ...> result links
        results = soup.findAll('a', href=re.compile(r"^/url\?q="))
        #results = soup.findAll('img')
        ret_res = []
        for result in results:
            eres = self._extract_result(result)
            if eres:
                ret_res.append(eres)
        return ret_res

    def _extract_result(self, result):
        imgs = result.findAll('img')
        if len(imgs) == 0:
            return FaceImageSearchResult("", "")
        thumbnail = imgs[0]['src']
        image = result['href'][7:]  # strip the leading "/url?q="
        return FaceImageSearchResult(thumbnail, image)

    def _extract_title_url(self, result):
        #title_a = result.find('a', {'class': re.compile(r'\bl\b')})
        title_a = result.find('a')
        if not title_a:
            self._maybe_raise(ParseError, "Title tag in Google search result was not found", result)
            return None, None
        title = ''.join(title_a.findAll(text=True))
        title = self._html_unescape(title)
        url = title_a['href']
        match = re.match(r'/url\?q=(http[^&]+)&', url)
        if match:
            url = urllib.parse.unquote(match.group(1))
        return title, url

    def _extract_description(self, result):
        desc_div = result.find('div', {'class': re.compile(r'\bs\b')})
        if not desc_div:
            self._maybe_raise(ParseError, "Description tag in Google search result was not found", result)
            return None
        desc_strs = []

        def looper(tag):
            if not tag:
                return
            for t in tag:
                try:
                    if t.name == 'br':
                        break
                except AttributeError:
                    pass
                try:
                    desc_strs.append(t.string)
                except AttributeError:
                    desc_strs.append(t)

        looper(desc_div)
        looper(desc_div.find('wbr'))  # BeautifulSoup does not self-close <wbr>
        desc = ''.join(s for s in desc_strs if s)
        return self._html_unescape(desc)

    def _html_unescape(self, text):
        def entity_replacer(m):
            entity = m.group(1)
            if entity in name2codepoint:
                return chr(name2codepoint[entity])
            return m.group(0)

        def ascii_replacer(m):
            cp = int(m.group(1))
            if cp <= 255:
                return chr(cp)
            return m.group(0)

        # the fourth positional argument of re.sub is `count`, so the
        # flags must be passed by keyword
        s = re.sub(r'&#(\d+);', ascii_replacer, text, flags=re.U)
        return re.sub(r'&([^;]+);', entity_replacer, s, flags=re.U)
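# Minimal paging sketch for GoogleImageSearch (hypothetical query; assumes
# the module-level Browser/BeautifulSoup imports above, live network access,
# and that Google still serves markup this scraper understands).
if __name__ == "__main__":
    gs = GoogleImageSearch("sunset", random_agent=True, timeout=10)
    gs.results_per_page = 20
    while not gs.eor:
        for res in gs.get_results():  # returns [] once the pages run out
            print(res)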
# (excerpt starts mid-script: `results` accumulates search pages fetched above)
results.extend(temp_res)
print("Got %d, sleeping 5 seconds and getting more." % len(results))
sleep(5)

print("Got %d results." % len(results))

ignore_ids = set()
seen_ids = set()


def get_id(url):
    # e.g. http://www.plurk.com/p/d8cyp/Good-Night-Plurkland -> "d8cyp"
    # (the original call had the pattern and the string swapped)
    m = re.search(r"/p/([a-z0-9]+)", url)
    if m:
        return m.group(1)
    return None


browser = Browser()
for nr, result in enumerate(results, start=1):
    try:
        page = browser.get_page(result.url)
        if page.find('href="/user/%s" class="user"' % username) != -1:
            id = get_id(result.url)
            if id in ignore_ids:
                continue
            if id not in seen_ids:
                print("%s (%d of %d)" % (result.url, nr, len(results)))
                seen_ids.add(id)
    except BrowserError as error:
        if "404" not in error.error:
            print("%s (%s)" % (error.error, error.url))
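# Sanity check for the corrected get_id(): the sample URL from the comment
# above yields its short plurk id, and non-plurk paths yield None.
assert get_id("http://www.plurk.com/p/d8cyp/Good-Night-Plurkland") == "d8cyp"
assert get_id("http://www.plurk.com/top/") is None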
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
os.chdir('/home/caozhzh/work/rss')

import re
import urllib

from xgoogle.BeautifulSoup import BeautifulSoup
from xgoogle.browser import Browser, BrowserError
from xgoogle.GeneralFetch import GeneralFetch

url = "http://blog.sina.com.cn/u/1696709200"
b = Browser()
page = b.get_page(url)
# strip the IE conditional comments that confuse the parser
page = page.replace('<!--[if lte IE 6]>', '')
page = page.replace('<![endif]-->', '')
#print page

be = BeautifulSoup(page)
div = be.find('div', {'class': 'diywidget'})
txt = ''.join(div.findAll(text=True))
#print type(txt)

import feedparser
origin_feed = feedparser.parse('http://blog.sina.com.cn/rss/1696709200.xml')

from feedformatter import Feed
import time
import datetime
import uuid
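# The excerpt stops right after the imports. A hedged sketch of the likely
# continuation: wrap the scraped widget text plus the original entries into
# a new feed via feedformatter's dict-based API (all field choices here are
# assumptions, not the author's code).
feed = Feed()
feed.feed["title"] = origin_feed.feed.title
feed.feed["link"] = url
feed.feed["pubDate"] = time.localtime()

item = {}
item["guid"] = str(uuid.uuid4())
item["title"] = "diywidget"
item["link"] = url
item["description"] = txt
item["pubDate"] = time.localtime()
feed.items.append(item)

for entry in origin_feed.entries:
    feed.items.append({
        "title": entry.title,
        "link": entry.link,
        "description": entry.get("summary", ""),
        "pubDate": entry.get("published_parsed") or time.localtime(),
    })

print(feed.format_rss2_string())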
class GoogleFaceImageSearch(object):
    SEARCH_URL_0 = "http://www.google.%(tld)s/search?tbm=isch&tbs=itp:face&hl=%(lang)s&q=%(query)s"
    NEXT_PAGE_0 = "http://www.google.%(tld)s/search?tbm=isch&tbs=itp:face&hl=%(lang)s&q=%(query)s&start=%(start)d"
    SEARCH_URL_1 = "http://www.google.%(tld)s/search?tbm=isch&tbs=itp:face&hl=%(lang)s&q=%(query)s&num=%(num)d"
    NEXT_PAGE_1 = "http://www.google.%(tld)s/search?tbm=isch&tbs=itp:face&hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d"

    def __init__(self, query, random_agent=False, debug=False, lang="en",
                 tld="com", re_search_strings=None):
        self.query = query
        self.debug = debug
        self.browser = Browser(debug=debug)
        self.results_info = None
        self.eor = False  # end of results
        self._page = 0
        self._first_indexed_in_previous = None
        self._filetype = None
        self._last_search_url = None
        self._results_per_page = 10
        self._last_from = 0
        self._lang = lang
        self._tld = tld
        if re_search_strings:
            self._re_search_strings = re_search_strings
        elif lang == "de":
            self._re_search_strings = ("Ergebnisse", "von", "ungefähr")
        elif lang == "es":
            self._re_search_strings = ("Resultados", "de", "aproximadamente")
        elif lang == "fr":
            self._re_search_strings = ("résultats", "de", "Environ")
        # add more localised versions here
        else:
            self._re_search_strings = ("Results", "of", "about")
        if random_agent:
            self.browser.set_random_user_agent()

    @property
    def num_results(self):
        if not self.results_info:
            page = self._get_results_page()
            self.results_info = self._extract_info(page)
            if self.results_info['total'] == 0:
                self.eor = True
        return self.results_info['total']

    @property
    def last_search_url(self):
        return self._last_search_url

    def _get_page(self):
        return self._page

    def _set_page(self, page):
        self._page = page

    page = property(_get_page, _set_page)

    def _get_first_indexed_in_previous(self):
        return self._first_indexed_in_previous

    def _set_first_indexed_in_previous(self, interval):
        if interval == "day":
            self._first_indexed_in_previous = 'd'
        elif interval == "week":
            self._first_indexed_in_previous = 'w'
        elif interval == "month":
            self._first_indexed_in_previous = 'm'
        elif interval == "year":
            self._first_indexed_in_previous = 'y'
        else:
            # a floating point value is a number of months
            try:
                num = float(interval)
            except ValueError:
                raise SearchError("Wrong parameter to first_indexed_in_previous: %s" % (str(interval)))
            self._first_indexed_in_previous = 'm' + str(interval)

    first_indexed_in_previous = property(
        _get_first_indexed_in_previous, _set_first_indexed_in_previous,
        doc="possible values: day, week, month, year, or a float value of months")

    def _get_filetype(self):
        return self._filetype

    def _set_filetype(self, filetype):
        self._filetype = filetype

    filetype = property(_get_filetype, _set_filetype, doc="file extension to search for")

    def _get_results_per_page(self):
        return self._results_per_page

    def _set_results_per_page(self, rpp):
        self._results_per_page = rpp

    results_per_page = property(_get_results_per_page, _set_results_per_page)

    def get_results(self):
        """Gets a page of results."""
        if self.eor:
            return []
        MAX_VALUE = 1000000
        page = self._get_results_page()
        results = self._extract_results(page)
        search_info = {'from': self.results_per_page * self._page,
                       'to': self.results_per_page * self._page + len(results),
                       'total': MAX_VALUE}
        if not self.results_info:
            self.results_info = search_info
            if self.num_results == 0:
                self.eor = True
                return []
        if not results:
            self.eor = True
            return []
        if self._page > 0 and search_info['from'] == self._last_from:
            self.eor = True
            return []
        if search_info['to'] == search_info['total']:
            self.eor = True
        self._page += 1
        self._last_from = search_info['from']
        return results

    def _maybe_raise(self, cls, *arg):
        if self.debug:
            raise cls(*arg)

    def _get_results_page(self):
        if self._page == 0:
            if self._results_per_page == 10:
                url = GoogleFaceImageSearch.SEARCH_URL_0
            else:
                url = GoogleFaceImageSearch.SEARCH_URL_1
        else:
            if self._results_per_page == 10:
                url = GoogleFaceImageSearch.NEXT_PAGE_0
            else:
                url = GoogleFaceImageSearch.NEXT_PAGE_1
        safe_url = [url % {'query': urllib.parse.quote_plus(self.query),
                           'start': self._page * self._results_per_page,
                           'num': self._results_per_page,
                           'tld': self._tld,
                           'lang': self._lang}]
        # possibly extend url with optional properties
        if self._first_indexed_in_previous:
            safe_url.extend(["&as_qdr=", self._first_indexed_in_previous])
        if self._filetype:
            safe_url.extend(["&as_filetype=", self._filetype])
        safe_url = "".join(safe_url)
        self._last_search_url = safe_url
        try:
            page = self.browser.get_page(safe_url)
        except BrowserError as e:
            raise SearchError("Failed getting %s: %s" % (e.url, e.error))
        return BeautifulSoup(page)

    def _extract_info(self, soup):
        empty_info = {'from': 0, 'to': 0, 'total': 0}
        div_ssb = soup.find('div', id='resultStats')
        if not div_ssb:
            self._maybe_raise(ParseError, "Div with number of results was not found on Google search page", soup)
            return empty_info
        #p = div_ssb.find('p')
        p = div_ssb
        if not p:
            self._maybe_raise(ParseError, """<p> tag within <div id="ssb"> was not found on Google search page""", soup)
            return empty_info
        txt = ''.join(p.findAll(text=True))
        txt = txt.replace(',', '')
        txt = txt.replace(' ', '')
        #matches = re.search(r'(\d+) - (\d+) %s (?:%s )?(\d+)' % self._re_search_strings, txt, re.U)
        #matches = re.search(r'(\d+) %s' % self._re_search_strings[0], txt, re.U|re.I)
        matches = re.search(r'(\d+)', txt, re.U)
        if not matches:
            print(self._re_search_strings[0])
            print(txt)
            return empty_info
        return {'from': 0, 'to': 0, 'total': int(matches.group(1))}

    def _extract_results(self, soup):
        # Should extract <a href="/url?q= ...> result links
        results = soup.findAll('a', href=re.compile(r"^/url\?q="))
        #results = soup.findAll('img')
        ret_res = []
        for result in results:
            eres = self._extract_result(result)
            if eres:
                ret_res.append(eres)
        return ret_res

    def _extract_result(self, result):
        imgs = result.findAll('img')
        if len(imgs) == 0:
            return FaceImageSearchResult("", "")
        thumbnail = imgs[0]['src']
        image = result['href'][7:]  # strip the leading "/url?q="
        return FaceImageSearchResult(thumbnail, image)

    def _extract_title_url(self, result):
        title_a = result.find('a', {'class': 'l'})
        if not title_a:
            self._maybe_raise(ParseError, "Title tag in Google search result was not found", result)
            return None, None
        title = ''.join(title_a.findAll(text=True))
        title = self._html_unescape(title)
        url = title_a['href']
        match = re.match(r'/url\?q=((http|ftp|https)[^&]+)&', url)
        if match:
            url = urllib.parse.unquote(match.group(1))
        match = re.match(r'/interstitial\?url=((http|ftp|https)[^&]+)&', url)
        if match:
            url = urllib.parse.unquote(match.group(1))
        return title, url

    def _extract_description(self, result):
        desc_div = result.find('span', {'class': 'st'})
        if not desc_div:
            self._maybe_raise(ParseError, "Description tag in Google search result was not found", result)
            return None
        # NOTE: this nested lookup searches *inside* the span found above,
        # so it normally comes back None unless the spans are nested
        desc_span = desc_div.find('span', {'class': 'st'})
        if not desc_span:
            self._maybe_raise(ParseError, "Description tag in Google search result was not found", result)
            return None
        desc_strs = []

        def looper(tag):
            if not tag:
                return
            for t in tag:
                try:
                    if t.name == 'br':
                        break
                except AttributeError:
                    pass
                try:
                    desc_strs.append(t.string)
                except AttributeError:
                    desc_strs.append(t)

        looper(desc_span)
        looper(desc_span.find('wbr'))  # BeautifulSoup does not self-close <wbr>
        desc = ''.join(s for s in desc_strs if s)
        return self._html_unescape(desc)

    def _html_unescape(self, text):
        def entity_replacer(m):
            entity = m.group(1)
            if entity in name2codepoint:
                # unichr is Python 2 only; chr matches the urllib.parse
                # usage elsewhere in this class
                return chr(name2codepoint[entity])
            return m.group(0)

        def ascii_replacer(m):
            cp = int(m.group(1))
            if cp <= 255:
                return chr(cp)
            return m.group(0)

        # pass flags by keyword: the fourth positional argument is `count`
        s = re.sub(r'&#(\d+);', ascii_replacer, text, flags=re.U)
        return re.sub(r'&([^;]+);', entity_replacer, s, flags=re.U)
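# Usage sketch for GoogleFaceImageSearch (hypothetical query). tbs=itp:face
# restricts the image results to faces; otherwise the class behaves like
# GoogleImageSearch above.
gs = GoogleFaceImageSearch("Guido van Rossum", random_agent=True, lang="en")
for res in gs.get_results():  # first page only
    print(res)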
class GoogleVideoSearch(object):
    SEARCH_URL_0 = "http://www.google.%(tld)s/search?tbm=vid&hl=%(lang)s&q=%(query)s"
    NEXT_PAGE_0 = "http://www.google.%(tld)s/search?tbm=vid&hl=%(lang)s&q=%(query)s&start=%(start)d"
    SEARCH_URL_1 = "http://www.google.%(tld)s/search?tbm=vid&hl=%(lang)s&q=%(query)s&num=%(num)d"
    NEXT_PAGE_1 = "http://www.google.%(tld)s/search?tbm=vid&hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d"

    def __init__(self, query, random_agent=False, debug=False, lang="en",
                 tld="com", re_search_strings=None, timeout=5):
        self.query = query
        self.debug = debug
        self.browser = Browser(debug=debug, timeout=timeout)
        self.results_info = None
        self.eor = False  # end of results
        self._page = 0
        self._first_indexed_in_previous = None
        self._filetype = None
        self._last_search_url = None
        self._results_per_page = 10
        self._last_from = 0
        self._lang = lang
        self._tld = tld
        if re_search_strings:
            self._re_search_strings = re_search_strings
        elif lang == "de":
            self._re_search_strings = ("Ergebnisse", "von", "ungefähr")
        elif lang == "es":
            self._re_search_strings = ("Resultados", "de", "aproximadamente")
        # add more localised versions here
        else:
            self._re_search_strings = ("Results", "of", "about")
        if random_agent:
            self.browser.set_random_user_agent()

    @property
    def num_results(self):
        if not self.results_info:
            page = self._get_results_page()
            self.results_info = self._extract_info(page)
            if self.results_info["total"] == 0:
                self.eor = True
        return self.results_info["total"]

    @property
    def last_search_url(self):
        return self._last_search_url

    def _get_page(self):
        return self._page

    def _set_page(self, page):
        self._page = page

    page = property(_get_page, _set_page)

    def _get_first_indexed_in_previous(self):
        return self._first_indexed_in_previous

    def _set_first_indexed_in_previous(self, interval):
        if interval == "day":
            self._first_indexed_in_previous = "d"
        elif interval == "week":
            self._first_indexed_in_previous = "w"
        elif interval == "month":
            self._first_indexed_in_previous = "m"
        elif interval == "year":
            self._first_indexed_in_previous = "y"
        else:
            # a floating point value is a number of months
            try:
                num = float(interval)
            except ValueError:
                raise SearchError("Wrong parameter to first_indexed_in_previous: %s" % (str(interval)))
            self._first_indexed_in_previous = "m" + str(interval)

    first_indexed_in_previous = property(
        _get_first_indexed_in_previous, _set_first_indexed_in_previous,
        doc="possible values: day, week, month, year, or a float value of months")

    def _get_filetype(self):
        return self._filetype

    def _set_filetype(self, filetype):
        self._filetype = filetype

    filetype = property(_get_filetype, _set_filetype, doc="file extension to search for")

    def _get_results_per_page(self):
        return self._results_per_page

    def _set_results_per_page(self, rpp):
        self._results_per_page = rpp

    results_per_page = property(_get_results_per_page, _set_results_per_page)

    def get_results(self):
        """Gets a page of results."""
        if self.eor:
            return []
        MAX_VALUE = 1000000
        page = self._get_results_page()
        results = self._extract_results(page)
        search_info = {"from": self.results_per_page * self._page,
                       "to": self.results_per_page * self._page + len(results),
                       "total": MAX_VALUE}
        if not self.results_info:
            self.results_info = search_info
            if self.num_results == 0:
                self.eor = True
                return []
        if not results:
            self.eor = True
            return []
        if self._page > 0 and search_info["from"] == self._last_from:
            self.eor = True
            return []
        if search_info["to"] == search_info["total"]:
            self.eor = True
        self._page += 1
        self._last_from = search_info["from"]
        return results

    def _maybe_raise(self, cls, *arg):
        if self.debug:
            raise cls(*arg)

    def _get_results_page(self):
        if self._page == 0:
            if self._results_per_page == 10:
                url = GoogleVideoSearch.SEARCH_URL_0
            else:
                url = GoogleVideoSearch.SEARCH_URL_1
        else:
            if self._results_per_page == 10:
                url = GoogleVideoSearch.NEXT_PAGE_0
            else:
                url = GoogleVideoSearch.NEXT_PAGE_1
        safe_url = [url % {"query": urllib.parse.quote_plus(self.query),
                           "start": self._page * self._results_per_page,
                           "num": self._results_per_page,
                           "tld": self._tld,
                           "lang": self._lang}]
        # possibly extend url with optional properties
        if self._first_indexed_in_previous:
            safe_url.extend(["&as_qdr=", self._first_indexed_in_previous])
        if self._filetype:
            safe_url.extend(["&as_filetype=", self._filetype])
        safe_url = "".join(safe_url)
        self._last_search_url = safe_url
        try:
            page = self.browser.get_page(safe_url)
        except BrowserError as e:
            raise SearchError("Failed getting %s: %s" % (e.url, e.error))
        return BeautifulSoup(page)

    def _extract_info(self, soup):
        empty_info = {"from": 0, "to": 0, "total": 0}
        div_ssb = soup.find("div", id="ssb")
        if not div_ssb:
            self._maybe_raise(ParseError, "Div with number of results was not found on Google search page", soup)
            return empty_info
        p = div_ssb.find("p")
        if not p:
            self._maybe_raise(ParseError, """<p> tag within <div id="ssb"> was not found on Google search page""", soup)
            return empty_info
        txt = "".join(p.findAll(text=True))
        txt = txt.replace(",", "")
        matches = re.search(r"%s (\d+) - (\d+) %s (?:%s )?(\d+)" % self._re_search_strings, txt, re.U)
        if not matches:
            return empty_info
        return {"from": int(matches.group(1)),
                "to": int(matches.group(2)),
                "total": int(matches.group(3))}

    def _extract_results(self, soup):
        results = soup.findAll("li", {"class": re.compile(r"\b(g videobox|g)\b")})
        ret_res = []
        for result in results:
            eres = self._extract_result(result)
            if eres:
                ret_res.append(eres)
        return ret_res

    def _extract_result(self, result):
        h3 = result.findAll("h3")
        name = ""
        for lonuri in h3:
            name = name + str(lonuri)
        # nltk.clean_html exists only in NLTK 2.x; NLTK 3 removed it
        videoname = nltk.clean_html(str(name))
        video_url = BeautifulSoup(str(h3)).findAll("a")
        # the original called str.split() on a bytes value; split the str directly
        url = video_url[0]["href"][7:].split("&")[0]
        url = url.replace("%3F", "?")
        url = url.replace("%3D", "=")
        desc = result.find("span", {"class": "st"})
        meta = result.find("span", {"class": "f"})
        author = ""
        duration = ""
        date = ""
        if meta is not None:
            metastr = nltk.clean_html(str(meta))
            metaarr = metastr.split("-")
            date = metaarr[0]
            duration = metaarr[1]
            author = metaarr[2][13:]
        description = nltk.clean_html(str(desc))
        return FaceVideoSearchResult(videoname, url, description, date,
                                     duration, author)
        # return FaceImageSearchResult(trumnail, image)

    def _extract_title_url(self, result):
        # title_a = result.find('a', {'class': re.compile(r'\bl\b')})
        title_a = result.find("a")
        if not title_a:
            self._maybe_raise(ParseError, "Title tag in Google search result was not found", result)
            return None, None
        title = "".join(title_a.findAll(text=True))
        title = self._html_unescape(title)
        url = title_a["href"]
        match = re.match(r"/url\?q=(http[^&]+)&", url)
        if match:
            url = urllib.parse.unquote(match.group(1))
        return title, url

    def _extract_description(self, result):
        desc_div = result.find("div", {"class": re.compile(r"\bs\b")})
        if not desc_div:
            self._maybe_raise(ParseError, "Description tag in Google search result was not found", result)
            return None
        desc_strs = []

        def looper(tag):
            if not tag:
                return
            for t in tag:
                try:
                    if t.name == "br":
                        break
                except AttributeError:
                    pass
                try:
                    desc_strs.append(t.string)
                except AttributeError:
                    desc_strs.append(t)

        looper(desc_div)
        looper(desc_div.find("wbr"))  # BeautifulSoup does not self-close <wbr>
        desc = "".join(s for s in desc_strs if s)
        return self._html_unescape(desc)

    def _html_unescape(self, text):
        def entity_replacer(m):
            entity = m.group(1)
            if entity in name2codepoint:
                return chr(name2codepoint[entity])
            return m.group(0)

        def ascii_replacer(m):
            cp = int(m.group(1))
            if cp <= 255:
                return chr(cp)
            return m.group(0)

        # pass flags by keyword: the fourth positional argument is `count`
        s = re.sub(r"&#(\d+);", ascii_replacer, text, flags=re.U)
        return re.sub(r"&([^;]+);", entity_replacer, s, flags=re.U)
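# Usage sketch for GoogleVideoSearch (hypothetical query). Each result is a
# FaceVideoSearchResult(videoname, url, description, date, duration, author);
# note the nltk.clean_html dependency, which only exists in NLTK 2.x.
gs = GoogleVideoSearch("python tutorial", random_agent=True, timeout=10)
for video in gs.get_results():
    print(video)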
class GoogleSearch(object):
    SEARCH_URL_0 = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&btnG=Google+Search"
    NEXT_PAGE_0 = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&start=%(start)d"
    SEARCH_URL_1 = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&num=%(num)d&btnG=Google+Search"
    NEXT_PAGE_1 = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d"

    def __init__(self, query, random_agent=True, debug=False, lang="en",
                 tld="com", re_search_strings=None, repeat=None, timeout=5):
        self.query = query
        self.debug = debug
        self.browser = Browser(debug=debug, timeout=timeout)
        self.results_info = None
        self.eor = False  # end of results
        self._page = 0
        self._first_indexed_in_previous = None
        self._filetype = None
        self._last_search_url = None
        self._results_per_page = 10
        self._last_from = 0
        self._lang = lang
        self._tld = tld
        self.repeat = repeat
        if re_search_strings:
            self._re_search_strings = re_search_strings
        elif lang == "de":
            self._re_search_strings = ("Ergebnisse", "von", "ungefähr")
        elif lang == "es":
            self._re_search_strings = ("Resultados", "de", "aproximadamente")
        # add more localised versions here
        else:
            self._re_search_strings = ("Results", "of", "about")
        if random_agent:
            self.browser.set_random_user_agent()

    @property
    def num_results(self):
        if not self.results_info:
            page = self._get_results_page()
            self.results_info = self._extract_info(page)
            if self.results_info["total"] == 0:
                self.eor = True
        return self.results_info["total"]

    @property
    def last_search_url(self):
        return self._last_search_url

    def _get_page(self):
        return self._page

    def _set_page(self, page):
        self._page = page

    page = property(_get_page, _set_page)

    def _get_first_indexed_in_previous(self):
        return self._first_indexed_in_previous

    def _set_first_indexed_in_previous(self, interval):
        if interval == "day":
            self._first_indexed_in_previous = "d"
        elif interval == "week":
            self._first_indexed_in_previous = "w"
        elif interval == "month":
            self._first_indexed_in_previous = "m"
        elif interval == "year":
            self._first_indexed_in_previous = "y"
        else:
            # a floating point value is a number of months
            try:
                num = float(interval)
            except ValueError:
                raise SearchError("Wrong parameter to first_indexed_in_previous: %s" % (str(interval)))
            self._first_indexed_in_previous = "m" + str(interval)

    first_indexed_in_previous = property(
        _get_first_indexed_in_previous, _set_first_indexed_in_previous,
        doc="possible values: day, week, month, year, or a float value of months")

    def _get_filetype(self):
        return self._filetype

    def _set_filetype(self, filetype):
        self._filetype = filetype

    filetype = property(_get_filetype, _set_filetype, doc="file extension to search for")

    def _get_results_per_page(self):
        return self._results_per_page

    def _set_results_per_page(self, rpp):
        self._results_per_page = rpp

    results_per_page = property(_get_results_per_page, _set_results_per_page)

    def get_results(self):
        """Gets a page of results."""
        if self.eor:
            return []
        MAX_VALUE = 1000000
        page = self._get_results_page()
        # search_info = self._extract_info(page)
        results = self._extract_results(page)
        search_info = {"from": self.results_per_page * self._page,
                       "to": self.results_per_page * self._page + len(results),
                       "total": MAX_VALUE}
        if not self.results_info:
            self.results_info = search_info
            if self.num_results == 0:
                self.eor = True
                return []
        if not results:
            self.eor = True
            return []
        if self._page > 0 and search_info["from"] == self._last_from:
            self.eor = True
            return []
        if search_info["to"] == search_info["total"]:
            self.eor = True
        self._page += 1
        self._last_from = search_info["from"]
        return results

    def _maybe_raise(self, cls, *arg):
        if self.debug:
            raise cls(*arg)

    def _get_results_page(self):
        """Construct search url, and get the page content"""
        if self._page == 0:
            if self._results_per_page == 10:
                url = GoogleSearch.SEARCH_URL_0
            else:
                url = GoogleSearch.SEARCH_URL_1
        else:
            if self._results_per_page == 10:
                url = GoogleSearch.NEXT_PAGE_0
            else:
                url = GoogleSearch.NEXT_PAGE_1
        safe_url = [url % {"query": urllib.parse.quote_plus(self.query),
                           "start": self._page * self._results_per_page,
                           "num": self._results_per_page,
                           "tld": self._tld,
                           "lang": self._lang}]
        # possibly extend url with optional properties
        if self._first_indexed_in_previous:
            safe_url.extend(["&as_qdr=", self._first_indexed_in_previous])
        if self._filetype:
            safe_url.extend(["&as_filetype=", self._filetype])
        if self.repeat:
            safe_url.extend(["&filter=", "0"])
        safe_url = "".join(safe_url)
        self._last_search_url = safe_url
        try:
            page = self.browser.get_page(safe_url)
        except BrowserError as e:
            raise SearchError("Failed getting %s: %s" % (e.url, e.error))
        return BeautifulSoup(page, "html.parser")

    def _extract_info(self, soup):
        """Extract total results: Page X of about XXX results"""
        empty_info = {"from": 0, "to": 0, "total": 0}
        div_ssb = soup.find("div", id="ssb")
        if not div_ssb:
            self._maybe_raise(ParseError, "Div with number of results was not found on Google search page", soup)
            return empty_info
        p = div_ssb.find("p")
        if not p:
            self._maybe_raise(ParseError, """<p> tag within <div id="ssb"> was not found on Google search page""", soup)
            return empty_info
        txt = "".join(p.findAll(text=True))
        txt = txt.replace(",", "")
        matches = re.search(r"%s (\d+) - (\d+) %s (?:%s )?(\d+)" % self._re_search_strings, txt, re.U)
        if not matches:
            return empty_info
        return {"from": int(matches.group(1)),
                "to": int(matches.group(2)),
                "total": int(matches.group(3))}

    def _extract_results(self, soup):
        """Extract results from the page"""
        # the original passed a tag-name list (soup.findAll(["div", "g"])),
        # which matches every <div>; the intent is presumably the
        # <div class="g"> result containers
        results = soup.findAll("div", {"class": "g"})
        ret_res = []
        for result in results:
            eres = self._extract_result(result)
            if eres:
                ret_res.append(eres)
        return ret_res

    def _extract_result(self, result):
        """Extract title, url, desc for a result"""
        title, url = self._extract_title_url(result)
        desc = self._extract_description(result)
        if not title and not url:
            return None
        return SearchResult(title, url, desc)

    def _extract_title_url(self, result):
        # title_a = result.find('a', {'class': re.compile(r'\bl\b')})
        # title_a = result.find('h3').find('a')
        title_a = result.find("a")
        if not title_a:
            self._maybe_raise(ParseError, "Title tag in Google search result was not found", result)
            return None, None
        title = "".join(title_a.findAll(text=True))
        title = self._html_unescape(title)
        url = title_a["href"]
        match = re.match(r"/url\?q=((http|ftp|https)[^&]+)&", url)
        if match:
            url = urllib.parse.unquote(match.group(1))
        match = re.match(r"/interstitial\?url=((http|ftp|https)[^&]+)&", url)
        if match:
            url = urllib.parse.unquote(match.group(1))
        return title, url

    def _extract_description(self, result):
        """Seems this is enough"""
        desc = result.find("span", {"class": "st"})
        return desc
        # NOTE: everything below is unreachable because of the early return
        # above; it is kept as the older extraction path.
        desc_div = result.find("span", "st")
        if not desc_div:
            self._maybe_raise(ParseError, "Description tag in Google search result was not found", result)
            return None
        desc_span = desc_div.find("span", {"class": "st"})
        if not desc_span:
            self._maybe_raise(ParseError, "Description tag in Google search result was not found", result)
            return None
        desc_strs = []

        def looper(tag):
            if not tag:
                return
            for t in tag:
                try:
                    if t.name == "br":
                        continue
                except AttributeError:
                    pass
                try:
                    desc_strs.append(t.string)
                except AttributeError:
                    desc_strs.append(t)

        looper(desc_span)
        looper(desc_span.find("wbr"))  # BeautifulSoup does not self-close <wbr>
        desc = "".join(s for s in desc_strs if s)
        return self._html_unescape(desc)

    def _html_unescape(self, text):
        def entity_replacer(m):
            entity = m.group(1)
            if entity in name2codepoint:
                return chr(name2codepoint[entity])
            return m.group(0)

        def ascii_replacer(m):
            cp = int(m.group(1))
            if cp <= 255:
                return chr(cp)
            return m.group(0)

        # pass flags by keyword: the fourth positional argument is `count`
        s = re.sub(r"&#(\d+);", ascii_replacer, text, flags=re.U)
        return re.sub(r"&([^;]+);", entity_replacer, s, flags=re.U)