def parse_data(data):
    """Parse a Google results page and return a list of result dicts.

    Returns either a single calculator-style answer
    (``[{"type": "string", "string": ...}]``) or a list of
    ``{"type": "result", "href": ..., "text": ...}`` dicts.

    Raises NoResultsException when the page has no results container
    or contains no result entries.
    """
    page = BeautifulSoup(data)
    results = page.find("div", id="res")
    if results is None:
        raise NoResultsException
    # Google marks calculator answers with a dedicated icon; when it is
    # present the answer text lives in the h2.r element.  (The original
    # reused one variable for both tags, which obscured the logic.)
    calc_icon = results.find("img", src="/images/icons/onebox/calculator-40.gif")
    if calc_icon is not None:
        answer = results.find("h2", {"class": "r"})
        if answer is not None:
            # find_all always returns a list (possibly empty), so no
            # None-check is needed before iterating.
            for sup in answer.find_all("sup"):
                # IRC has no superscripts; render exponents as "^n".
                sup.contents[0].replaceWith("^" + sup.contents[0])
            return [dict(type="string",
                         string=util.strip_html(answer).decode("utf-8"))]
    nresults = results.find_all("li", {"class": "g"})
    if not nresults:
        raise NoResultsException
    processed_results = []
    for entry in nresults:
        a_tag = entry.find("a")
        if a_tag is None:
            continue
        # Google wraps target URLs in a redirect; the real destination is
        # carried in the "q" query parameter.
        href = urlparse.parse_qs(urlparse.urlparse(a_tag["href"]).query)["q"][0]
        processed_results.append(
            dict(type="result",
                 href=href,
                 text=util.strip_html(a_tag).decode("utf-8")))
    return processed_results
def parse_data(data):
    """Extract search results (or a calculator answer) from a Google page.

    Raises NoResultsException if the results container or its entries
    are absent.
    """
    page = BeautifulSoup(data)
    res_div = page.find("div", id="res")
    if res_div is None:
        raise NoResultsException
    # Calculator onebox: identified by its icon; the text sits in h2.r.
    if res_div.find("img", src="/images/icons/onebox/calculator-40.gif") is not None:
        heading = res_div.find("h2", {"class": "r"})
        if heading is not None:
            sups = heading.find_all("sup")
            if sups is not None and len(sups):
                for sup in sups:
                    # Spell out exponents as "^n" for plain-text output.
                    sup.contents[0].replaceWith("^" + sup.contents[0])
            return [dict(type="string",
                         string=util.strip_html(heading).decode("utf-8"))]
    entries = res_div.find_all("li", {"class": "g"})
    if len(entries) == 0:
        raise NoResultsException
    collected = []
    for entry in entries:
        link = entry.find("a")
        if link is not None:
            # The real target URL is tucked into the "q" query parameter
            # of Google's redirect link.
            target = urlparse.parse_qs(
                urlparse.urlparse(link["href"]).query)["q"][0]
            collected.append(dict(type="result",
                                  href=target,
                                  text=util.strip_html(link).decode("utf-8")))
    return collected
def fetch_title(callback, m):
    """Fetch the URL matched by *m* and return its <title> text.

    Returns None for non-OK responses or non-HTML/XML content, an error
    string when the request itself fails, and otherwise the (possibly
    truncated) page title.
    """
    url = m.group()
    try:
        # NOTE: certificate verification is deliberately disabled so the
        # bot can reach internal sites with self-signed certs; fetched
        # titles are not security-sensitive.
        r = requests.get(url, verify=False)
    except RequestException as e:
        # Caught here so a malformed URL posted to IRC doesn't bubble up
        # as an unhandled exception.
        return "nope, didn't get it (%s)" % str(e)
    if r.status_code != requests.codes.ok:
        return
    # Use .get(): a response missing the Content-type header entirely
    # would otherwise raise KeyError here.
    content_type = r.headers.get('Content-type', '')
    if not content_type.startswith(('text', 'application/xml')):
        return
    # BeautifulSoup's objection to being passed something like
    # a JPG as a unicode string seems to be to raise a UnicodeEncodeError.
    # I could catch that, but it feels nasty. Mind you, so does this...
    # (test-case: "http://jacovanstaden.files.wordpress.com/2011/03/git-flow-overview.jpg")
    try:
        if r.text[:1] != '<':
            return
        page = BeautifulSoup(r.text)
    except HTMLParser.HTMLParseError:
        return "Could not parse %s with BeautifulSoup. Shun the author." % url
    except TypeError:
        # This seems to be the case if r.text is, for example, an image. This can
        # still happen if a site sends a malformed Content-type header, but it
        # should be rare.
        return
    result = page.find("title")
    if result is None:
        return "Untitled (no <title> tag found)"
    title = util.strip_html(result).decode("utf-8")
    # Keep IRC lines short: truncate long titles with an ellipsis.
    if len(title) > 200:
        title = "%s..." % title[:197]
    return title
def parse_html(data):
    """Return the stripped text of the first ``<a class="h1a">`` element."""
    document = soup(data)
    anchor = document.find("a", {"class": "h1a"})
    return util.strip_html(anchor)