def parse(self, url, data, fromEncoding=None):
    soup = BeautifulSoup(data, fromEncoding=(fromEncoding or "windows-1252"))
    title = soup.fetch("title")[0].string
    fnr = soup.fetch("h1")[0].string.split(".", 1)[0]
    m = DATE_RE.search(title)
    year = int(m.group(1))
    mon = int(m.group(2))
    day = int(m.group(3))
    hour = int(m.group(4))
    minu = int(m.group(5))
    existing = Fundur.objects.filter(fnr=fnr)
    if existing:
        fn = existing[0]
    else:
        fn = Fundur()
    fn.titill = title
    fn.lth, fn.fnr = url_to_lth_fnr(url)
    fn.dags = "%4.4d-%2.2d-%2.2d %2.2d:%2.2d" % (year, mon, day, hour, minu)
    fn.save()
    return ScraperParserHTML.parse(self, url, data, soup=soup)
def parseMultiselect(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    aList = soup.fetch("a", {"href": "/weather/local/%"})
    aList += soup.fetch("a", {"href": "/outlook/travel/local/%"})
    aList += soup.fetch("a", {"href": "/outlook/travel/businesstraveler/local/%"})
    lastCode = ""
    resultsCount = 0
    for aItem in aList:
        afterLocal = aItem['href'].split("local/")
        if 2 == len(afterLocal):
            textAfterLocal = afterLocal[1]
            if 8 < len(textAfterLocal):
                code = textAfterLocal[:8]
                textAfterLocal = textAfterLocal[8:]
                if textAfterLocal.startswith("?from=search_"):
                    # skip location codes we have already seen
                    if -1 == lastCode.find(code):
                        lastCode += code
                        text = getAllTextFromTag(aItem)
                        resultsCount += 1
                        returned.append((text, code))
    if 0 == resultsCount:
        return (LOCATION_UNKNOWN, None)
    return (LOCATION_MULTISELECT, universalDataFormatReplaceEntities(returned))
def _spider_book_info(url, letter):
    try:
        html = getHttp(url, handleException=False)
        soup = BeautifulSoup()
        soup.feed(html)
        h1 = soup.first("h1")
        if h1 is None:
            return None
        title = retrieveContents(h1).decode("iso-8859-1")
        subtitle = None
        author = None
        code = None
        labels = [retrieveContents(x) for x in soup.fetch("span", {"class": "title-label"})]
        data = soup.fetch("span", {"class": "title-data"})
        try:
            index = labels.index("Subtitle")
            subtitle = retrieveContents(data[index]).decode("iso-8859-1")
        except ValueError:
            pass
        try:
            index = labels.index("Author")
            author = retrieveContents(data[index].first("a")).decode("iso-8859-1")
        except ValueError:
            pass
        try:
            index = labels.index("Language")
            href = str(data[index].first("a", {"href": "/language.php?code=%"})["href"])
            code = href[19:href.find("&", 19)].decode("iso-8859-1")
        except ValueError:
            pass
        tid = soup.first("input", {"type": "hidden", "name": "tid"})
        assert tid is not None
        book_id = tid["value"].decode("iso-8859-1")
        print (u"%s: \"%s\"" % (author, title)).encode("iso-8859-1", "ignore")
        sel = soup.first("select", {"name": "book"})
        assert sel is not None
        opts = sel.fetch("option")
        formats = []
        for opt in opts:
            try:
                format = retrieveContents(opt).split()[0]
                if format not in ebooks.FORMATS:
                    continue
                val = opt["value"]
                formats.append((format, val))
            except Exception, ex:
                log(SEV_EXC, exceptionAsStr(ex))
        formats.sort()
        return (url, title, subtitle, author, book_id, code, formats)
    except Exception, ex:
        # assumed handler: log the failure and give up on this book
        log(SEV_EXC, exceptionAsStr(ex))
        return None
def parseFirstDayHtml(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    # sky, now, feelsLike, wind, humidity, pressure, dew point, visibility
    returned = [None, None, None, None, None, None, None, "N/A"]
    bItems = soup.fetch("b", {"class": "obsTextA"})
    if len(bItems) == 2:
        returned[0] = getAllTextFromTag(bItems[0]).strip()
        temp = getAllTextFromTag(bItems[1]).strip().split("Like ")
        if len(temp) > 1:
            returned[2] = temp[1].replace("°F", "").strip()
    bItem = soup.first("b", {"class": "obsTempTextA"})
    if bItem:
        returned[1] = getAllTextFromTag(bItem).replace("°F", "").strip()
    tdList = soup.fetch("td", {"class": "obsTextA"})
    if len(tdList) == 8:
        tdList = tdList[1::2]
        assert len(tdList) == 4
        returned[3] = getAllTextFromTag(tdList[0]).strip()
        returned[4] = getAllTextFromTag(tdList[1]).replace("%", "").strip()
        # TODO: handle down, up, ...
        returned[5] = getAllTextFromTag(tdList[2]).replace("in.", "inches").strip()
        returned[6] = getAllTextFromTag(tdList[3]).replace("°F", "").strip()
    for r in returned:
        if r is None or r == "":
            return None
    return returned
def parseName(htmlTxt):
    # work around malformed comments ("<! -- ") in the source html
    htmlTxt = htmlTxt.replace("<! -- ", "<!---")
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    # no results?
    fontList = soup.fetch("font", {"face": "arial"})
    for fontItem in fontList:
        iItem = fontItem.first("i")
        if iItem:
            if str(iItem.contents[0]).startswith("Your search for"):
                return (NO_RESULTS, sNoResultsText)
    # get table data
    trList = soup.fetch("tr", {"bgcolor": "#ffffff"})
    resultsCount = 0
    outerList = []
    for trItem in trList:
        tdList = trItem.fetch("td")
        if 5 == len(tdList):
            symbol = getAllTextFromTag(tdList[0]).strip()
            url = tdList[0].first("a")['href']
            name = getAllTextFromTag(tdList[1]).strip()
            market = getAllTextFromTag(tdList[2]).strip()
            industry = getAllTextFromTag(tdList[3]).strip()
            outerList.append((url, symbol, name, market, industry))
            resultsCount += 1
    # no results?
    if 0 == resultsCount:
        return (NO_RESULTS, sNoResultsText)
    return (STOCKS_LIST, universalDataFormatReplaceEntities(outerList))
def parse(wordtosearch):
    url = 'http://dictionary.reference.com/search?q=' + wordtosearch
    # Read the URL and pass it to BeautifulSoup.
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup()
    soup.feed(html)
    # Read the main table, extracting the words from the table cells.
    maintable = soup.fetch('li')
    # There are 6 <li> lines at the bottom that we don't want to print,
    # so we remove them from the list by adjusting the count.
    removeli = len(maintable) - 6
    # if removeli is 0 then we need to look for dl tags
    if removeli == 0:
        # fetch dl tags
        maintable = soup.fetch('dl')
        for defs in maintable:
            converttostring = str(defs)
            splitstring = converttostring.split('<dd>')
            removetrash = re.sub(
                '^ |</dd.*dl>|<li.*">|<ol.*">|<cite> </li>|<cite>|</cite>|</ol></li>|</li>|<a.*/a>',
                '', splitstring[1])
            addunderscores = re.sub('<u><i>|</i></u>', '_', removetrash)
            convertampersands = re.sub('&amp;', '&', addunderscores)
            definition = convertampersands
            print definition
    else:
        for counter in range(removeli):
            defs = maintable[counter]
            converttostring = str(defs)
            splitstring = converttostring.split('<li>')
            if len(splitstring) != 1:
                removetrash = re.sub(
                    '^ |(<li.*">|<ol.*">|<cite> </li>|<cite>|</cite>|</ol></li>|</li>|<a.*/a>)',
                    '', splitstring[1])
                addunderscores = re.sub('(<u><i>|</i></u>)', '_', removetrash)
                convertampersands = re.sub('&amp;', '&', addunderscores)
                definition = convertampersands
                print definition
            else:
                removetrash = re.sub(
                    '^ |<li.*">|<ol.*">|<cite> </li>|<cite>|</cite>|</ol></li>|</li>|<a.*/a>',
                    '', splitstring[0])
                addunderscores = re.sub('<u><i>|</u></i>', '_', removetrash)
                convertampersands = re.sub('&amp;', '&', addunderscores)
                definition = convertampersands
                print definition
def parse_posts(self, posts):
    '''Parses posts from table/ajax_getposts.'''
    rposts = []
    for post in posts:
        soup = BeautifulSoup(post)
        rposts.append({
            'date': soup.fetch('tr')[1].fetch('font')[0].text,
            'link': soup.fetch('tr')[1].fetch('font')[1].a['href'],
            'html': soup.fetch('tr')[2].fetch('td')[1],
            'plaintext': soup.fetch('tr')[2].fetch('td')[1].text,
        })
    return rposts
def action(self):
    resp = self.browser.open(self.BASE_URL)
    html = resp.read()
    soup = BeautifulSoup(html)
    news_chunk1 = soup.fetch('div', {'class': 'item'})
    news_chunk2 = soup.fetch('div', {'class': 'headline'})
    news_chunk3 = soup.fetch('div', {'class': 'body-copy'})
    for new in news_chunk1:
        head = new.findAll('a')
        for hd in head:
            print_ = str(hd).split('=')
            print print_[3].replace('"thumb" src', '').replace('onclick', '').replace('"', '').replace(' ', '')
def test_render_outline(self):
    c = template.Context({"package": self.package})
    t = template.Template('''
        {% load mainpage_extras %}
        {% render_outline package %}
    ''')
    output = t.render(c)
    soup = BeautifulSoup(output)
    root = soup.find(attrs={'nodeid': '1'})
    self.assertTrue('Root' in root.contents[0])
    self.assertEquals(len(soup.fetch('li')), 5)
    self.assertEquals(len(soup.fetch('a')), 5)
    self.assertEquals(len(soup.fetch('ul')), 3)
def reverseAreaCodeLookup(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = m411testNoResults(soup)
    if NO_RESULTS == noResults:
        return (noResults, m411NoResultsText)
    # results
    tableList = soup.fetch("table", {"id": "listings"})
    if len(tableList) != 1:
        return UNKNOWN_FORMAT, None
    trList = tableList[0].fetch("tr")
    if len(trList) == 0:
        return UNKNOWN_FORMAT, None
    # ignore headers ([1:])
    for trItem in trList[1:]:
        # the source html nests <tr> inside <tr>, so skip the outer wrappers
        if 0 == len(trItem.fetch("tr")):
            tdList = trItem.fetch("td", {"id": "subtextid"})
            if 3 == len(tdList):
                city = getAllTextFromTag(tdList[0])
                country = getAllTextFromTag(tdList[1])
                timezone = getAllTextFromTag(tdList[2])
                returned.append((city, country, timezone))
            elif 2 == len(tdList):
                city = getAllTextFromTag(tdList[0])
                country = ""
                timezone = getAllTextFromTag(tdList[1])
                returned.append((city, country, timezone))
    if 0 == len(returned):
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    return (RESULTS_DATA, universalDataFormatReplaceEntities(returned))
def internationalCodeSearch(htmlTxt):
    result = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = m411testNoResults(soup)
    if NO_RESULTS == noResults:
        return (noResults, m411NoResultsText)
    # results
    tableList = soup.fetch("table", {"id": "listings"})
    if len(tableList) != 1:
        return UNKNOWN_FORMAT, None
    trList = tableList[0].fetch("tr")
    cityCodeList = []
    for trItem in trList:
        if 0 == len(trItem.fetch("tr")):
            tdList = trItem.fetch("td", {"id": "subtextid"})
            if 2 == len(tdList):
                if 0 == len(result):
                    result.append([getAllTextFromTag(tdList[1])])
                else:
                    city = getAllTextFromTag(tdList[0])
                    code = getAllTextFromTag(tdList[1])
                    cityCodeList.append((city, code))
    # sort the (city, code) list by city
    cityCodeList.sort(sortByCityFunc)
    for el in cityCodeList:
        result.append(el)
    if 0 == len(result):
        return UNKNOWN_FORMAT, None
    return (RESULTS_DATA, universalDataFormatReplaceEntities(result))
def __init__(self, url='http://www.synthetic.org/play.html'):
    self.playlists = []
    page = BeautifulSoup(urlopen(url))
    for link in page.fetch('a', {'href': re.compile(r'play/rsa\d+\.htm')}):
        iterMonthLists = iterMonthPlayLists(urljoin(url, link['href']))
        try:
            self.playlists.extend(iterMonthLists)
        except ValueError:
            warn('unparsed link: %s' % link)
def reverseZIPCodeLookup(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    res = testNoResults(soup)
    if RESULTS_DATA != res:
        return (res, None)
    # results (there may be more than one table)
    tables = soup.fetch("table", {"summary": "Codes Results"})
    if 0 == len(tables):
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    trList = []
    for tab in tables:
        trList += tab.fetch("tr")
    for tr in trList:
        tdList = tr.fetch("td")
        if len(tdList) == 3:
            city = getAllTextFromTag(tdList[0])
            country = getAllTextFromTag(tdList[1])
            timezone = getAllTextFromTag(tdList[2])
            if city != "New Search":
                returned.append((city, country, timezone))
        elif len(tdList) == 2:
            # special case (911)
            city = getAllTextFromTag(tdList[0])
            country = getAllTextFromTag(tdList[1])
            if city != "New Search":
                returned.append((city, country, ""))
    if len(returned) == 0:
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    return (RESULTS_DATA, universalDataFormatReplaceEntities(returned))
def parseFirstDayHtmlYahoo(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    # sky, now, feelsLike, wind, humidity, pressure, dew point, visibility
    returned = [None, None, None, None, None, None, None, None]
    fontList = soup.fetch("font", {"face": "Arial", "size": "2"})
    list = []
    wasFeelsLike = False
    for f in fontList:
        text = getAllTextFromTag(f).strip()
        if wasFeelsLike:
            list.append(text)
        else:
            if text == "Feels Like:":
                list.append(text)
                wasFeelsLike = True
    if len(list) >= 16:
        smallList = list[1::2]
        returned[0] = ""
        returned[1] = smallList[0].replace("°", "")
        returned[2] = smallList[0].replace("°", "")
        returned[3] = smallList[3]
        returned[4] = smallList[4].replace("%", "")
        returned[5] = smallList[2]
        returned[6] = smallList[1].replace("°", "")
        returned[7] = smallList[6]
    for r in returned:
        if r is None:
            return None
    return returned
def internationalCodeSearch(htmlTxt):
    result = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    res = testNoResults(soup)
    if RESULTS_DATA != res:
        return (res, None)
    # country code(s)
    tableList = soup.fetch("table", {"summary": "Codes Results"})
    if len(tableList) != 1:
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    # found country code
    tdListA = tableList[0].fetch("td", {"style": "%padding-left:5px;"})
    tdListB = tableList[0].fetch("td", {"style": "%line-height:14pt;"})
    if len(tdListA) != len(tdListB):
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    cityCodeList = []
    for i in range(len(tdListA)):
        if 0 == i:
            result.append([getAllTextFromTag(tdListB[i])])
        else:
            city = getAllTextFromTag(tdListA[i])
            code = getAllTextFromTag(tdListB[i])
            cityCodeList.append((city, code))
    # sort the (city, code) list by city
    cityCodeList.sort(sortByCityFunc)
    for el in cityCodeList:
        result.append(el)
    return (RESULTS_DATA, universalDataFormatReplaceEntities(result))
def parseCurrency(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    # <TABLE WIDTH="100%" BORDER="0" CELLPADDING="1" CELLSPACING="1" BGCOLOR="#000000">
    findTable = soup.fetch(
        "table",
        {"width": "100%", "border": "0", "cellpadding": "1",
         "cellspacing": "1", "bgcolor": "#000000"})
    if not findTable:
        return (UNKNOWN_FORMAT, currencyNoResultsText)
    itemTable = findTable[0]
    findTableTR = itemTable.fetch("tr")
    # parse the page and build the abbreviation -> rate dictionary
    for itemTR in findTableTR:
        findTD = itemTR.fetch("td")
        if 0 == len(findTD):
            continue
        if 4 != len(findTD):
            return (UNKNOWN_FORMAT, currencyNoResultsText)
        abbrev = str(findTD[1].contents[0].contents[0].contents[0])
        g_AbbrevToRatesDict[abbrev] = float(
            str(findTD[2].contents[0].contents[0]).replace(",", "").strip())
    g_AbbrevToRatesDict["USD"] = 1.0
    return (RESULTS_DATA, g_AbbrevToRatesDict)
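# Hypothetical usage sketch, not part of the original module: assuming
# parseCurrency leaves g_AbbrevToRatesDict holding units of each currency per
# 1 USD, converting between two currencies is a ratio of the two rates.
def convertCurrencySketch(amount, fromAbbrev, toAbbrev):
    usd = amount / g_AbbrevToRatesDict[fromAbbrev]
    return usd * g_AbbrevToRatesDict[toAbbrev]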
def parseGasOld(htmlTxt, url=None, dbgLevel=0):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    testTitle = soup.first("title")
    if testTitle:
        if getAllTextFromTag(testTitle).startswith("GasBuddy.com - Find cheap gas prices in your city"):
            return (LOCATION_UNKNOWN, gLocationUnknownText)
    outerList = []
    trList = soup.fetch("tr")
    for trItem in trList:
        tdList = trItem.fetch("td")
        if 8 == len(tdList):
            if tdList[1].first("table"):
                price = getAllTextFromTag(tdList[0]).strip()
                name = getAllTextFromTag(tdList[2]).strip()
                address = getAllTextFromTag(tdList[4]).strip()
                area = getAllTextFromTag(tdList[5]).strip()
                time = getAllTextFromTag(tdList[6]).strip()
                outerList.append([price, name, address, area, time])
        elif 0 != len(tdList):
            firstB = tdList[0].first("b")
            if firstB:
                if getAllTextFromTag(firstB).startswith("No gas prices found."):
                    return (NO_RESULTS, gNoResultsText)
    if 0 == len(outerList):
        if dbgLevel > 0:
            print "len(outerList)==0"
        return parsingFailed(url, htmlTxt)
    return (GAS_DATA, universalDataFormatReplaceEntities(outerList))
def _search_serie_http(self, searchterms):
    """Search for a series and return its episode list page URL"""
    # google power!
    url = "http://www.google.com/search?hl=en&q=site:epguides.com%%20%s"
    search = "%s %s"
    search = urllib.quote(search % (searchterms, '"(a Titles and Air Dates Guide)"'))
    f = urllib.urlopen(url % search)
    bs = BeautifulSoup(f)
    if not bs:
        return False
    results = []
    # tidy up the search results
    for url in bs.fetch("a", {"href": re.compile("http://epguides.com/")}):
        url = url['href']
        # only add series summary pages (they don't end with .html)
        if url.endswith("/"):
            results.append(url)
    if not results:
        return False
    # the first result is (usually) the correct one
    return results[0]
def email(self):
    url = 'http://pgp.mit.edu/pks/lookup?op=index&search=%s' % self.artifact['name']
    try:
        status, response = get(url, headers=self.headers)
        if status:
            if 'No results found' in response.text:
                pass
            else:
                data = BeautifulSoup(response.text)
                hrefs = data.fetch('a')
                for href in hrefs:
                    content = href.contents
                    if self.artifact['name'] in content[0]:
                        try:
                            name = content[0].split('<')[0]
                            if not isinstance(self.artifact['data']['pgp'], list):
                                self.artifact['data']['pgp'] = []
                            self.artifact['data']['pgp'].append(name)
                        except IndexError:
                            pass
    except:
        pass
def parseList(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    # find table with results
    tableList = soup.fetch("table", {"cellpadding": "0", "cellspacing": "0",
                                     "border": "0", "width": "100%"})
    if 0 == len(tableList):
        return (UNKNOWN_FORMAT, jUnknownFormatText)
    outerList = []
    for table in tableList:
        trList = table.fetch("tr")
        if 2 <= len(trList):
            tdCount = len(trList[0].fetch("td"))
            if 3 > tdCount:
                return (UNKNOWN_FORMAT, jUnknownFormatText)
            for tr in trList[1:]:
                tdList = tr.fetch("td")
                rank = ""
                if 4 == tdCount:
                    rank = getAllTextFromTag(tdList[0])
                title = getAllTextFromTag(tdList[-3])
                rating = getAllTextFromTag(tdList[-2])
                explicitness = getAllTextFromTag(tdList[-1])
                url = tdList[-3].first("a")["href"]
                if not url:
                    return (UNKNOWN_FORMAT, jUnknownFormatText)
                outerList.append((rank, title, rating, explicitness, url))
    if 0 == len(outerList):
        return (NO_RESULTS, jNoResultsText)
    return (JOKES_LIST, universalDataFormatReplaceEntities(outerList))
def fqdn(self):
    url = 'http://pgp.mit.edu/pks/lookup?op=index&search=%s' % self.artifact['name']
    try:
        status, response = get(url, headers=self.headers)
        if status:
            if 'No results found' in response.text:
                pass
            else:
                data = BeautifulSoup(response.text)
                items = data.fetch('a')
                for item in items:
                    # re.findall needs a string, not a Tag
                    matches = re.findall(re_email, str(item))
                    for m in matches:
                        if not isinstance(self.artifact['data']['pgp'], list):
                            self.artifact['data']['pgp'] = []
                        self.artifact['data']['pgp'].append(m)
                        self.artifact['children'].append({
                            'name': m,
                            'type': 'email',
                            'source': 'PGP',
                            'subtype': None
                        })
    except:
        pass
def bot_w(mess, nick, botCmd):
    """Weather forecast"""
    if len(botCmd) == 1:
        message = u"Usage: !weather + City Name or zip code"
    else:
        cityname = botCmd[1]
        url = 'http://search.weather.com.cn/static/url.php'
        values = {'cityinfo': cityname.encode('utf8')}
        data = urllib.urlencode(values)
        req = urllib2.Request(url, data)
        response = urllib2.urlopen(req)
        the_page = response.read()
        url = the_page[the_page.find('URL=') + 4:len(the_page) - 3]
        request = urllib2.Request(url)
        page = urllib2.urlopen(request).read()
        message = 'Cannot find such a city'
        if page.find('<div class="box_contenttodayinwea" id="c_1_1">') != -1:
            soup = BeautifulSoup(page)
            message = str(soup.head.title)
            message = message[7:message.find('-')] + ', '
            page = page[page.find('<div class="box_contenttodayinwea" id="c_1_1">'):]
            page = page[:page.find('</div>') + 6]
            soup = BeautifulSoup(page)
            ems = soup.fetch('em')
            for i in range(0, 3):
                message = message + re.sub('<(.|\n)+?>', '', str(ems[i]))
                if i < 2:
                    message = message + ', '
    return message
def parseDream2(htmlTxt):
    soup = BeautifulSoup()
    # TODO: this is temporary:
    htmlTxt = htmlTxt.replace(
        "/*<![CDATA[*/ @import \"/knowledge/stylesheets/monobook/main.css\"; /*]]>*/", "")
    soup.feed(htmlTxt)
    tableMain = soup.fetch("table", {"width": "768", "align": "center",
                                     "cellspacing": "0", "cellpadding": "0"})
    if not tableMain:
        return (UNKNOWN_FORMAT, dUnknownFormatText)
    td = None
    for table in tableMain:
        tr = table.first("tr")
        if tr:
            tdTest = tr.first("td", {"width": "100%", "valign": "top"})
            if tdTest:
                td = tdTest
    if not td:
        return (UNKNOWN_FORMAT, dUnknownFormatText)
    # re-parse the cell contents; the extraction below fails without this step
    soup2 = BeautifulSoup()
    soup2.feed(str(td).replace("<br />>", ""))
    td = soup2.first("td")
    # no results?
    if td.first("center"):
        return (NO_RESULTS, dNoResultsText)
    # results
    bTable = td.fetch("b")
    if not bTable:
        return (UNKNOWN_FORMAT, dUnknownFormatText)
    outerList = []
    for bItem in bTable:
        title = getAllTextFromTag(bItem)
        next = getLastElementFromTag(bItem)
        pItem = None
        while next and not pItem:
            if isinstance(next, Tag):
                if next.name == "p":
                    pItem = next
            next = next.next
        if pItem:
            text = getAllTextFromTagWithA(pItem.first("font"))
            if text.startswith("Interpretation: "):
                text = text[len("Interpretation: "):]
            outerList.append((title, text))
    if 0 == len(outerList):
        return (NO_RESULTS, dNoResultsText)
    return (DREAM_DATA, universalDataFormatReplaceEntities(outerList))
def parseRandomQuotes(htmlTxt, modulesInfo):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    quotes = []
    dtList = soup.fetch("dt", {"class": "quote"})
    ddList = soup.fetch("dd", {"class": "author"})
    if len(dtList) == len(ddList) and len(dtList) > 0:
        for i in range(len(ddList)):
            quote = getAllTextFromTag(dtList[i])
            next = ddList[i]
            bItem = None
            while next and bItem is None:
                next = next.next
                if isinstance(next, Tag):
                    if next.name == "b":
                        bItem = next
                    elif next.name == "dt":
                        next = None
                    elif next.name == "select":
                        next = None
            if bItem:
                aItem = bItem.first("a")
                if aItem:
                    author = getAllTextFromTag(aItem)
                else:
                    author = getAllTextFromTag(bItem)
                quotes.append([author, "\"" + quote.strip() + "\""])
    if 0 == len(quotes):
        return UNKNOWN_FORMAT, None
    # build definition
    df = Definition()
    te = df.TextElement("Random Quotes", style=styleNamePageTitle)
    te.setJustification(justCenter)
    df.LineBreakElement()
    addQuotesToDefinition(df, quotes, modulesInfo)
    df.LineBreakElement()
    par = df.ParagraphElement(False)
    par.setJustification(justCenter)
    df.TextElement("Daily", link="s+quotes:daily")
    df.TextElement(" \x95 ", style=styleNameGray)
    df.TextElement("Random", link="s+quotes:random")
    df.PopParentElement()
    return QUOTES_DATA, universalDataFormatWithDefinition(df, [])
def test_basic(self):
    self.open("http://localhost:8000/resources/Patient")
    self.update_text("#inputEmail", self.localhost_email)
    self.update_text("#inputPassword", self.localhost_password)
    self.click('button[type="Submit"]')
    self.wait_for_text_visible("reportforgenetics", "body")
    self.wait_for_text_visible("Condition", "body")
    self.click('button[name="authorize"]')
    self.wait_for_text_visible("Patient", "table")
    base_url = self.driver.current_url.split('/r')[0]
    source = self.driver.page_source
    soup = BeautifulSoup(source)
    num_rows = len(soup.fetch("a")) - 3  # skip header, etc.
    for i in xrange(num_rows):
        href = soup.fetch("a")[i + 3].attrs[0][1]
        self.open(base_url + href)
        self.wait_for_text_visible("Genetics Report for", "h3")
        self.wait_for_text_visible("Clinical Context", "h4")
        self.wait_for_text_visible("Genetics Information", "body")
def cve(self, irc, msg, args):
    word = self._prepare_term(args[0], "-")
    if re.search('cve', word, re.IGNORECASE) is None:
        url = 'http://cve.mitre.org/cgi-bin/cvekey.cgi?keyword=' + word
        category = 'keyword'
    else:
        url = 'http://cve.mitre.org/cgi-bin/cvename.cgi?name=' + word
        category = 'name'
    # Read the URL and pass it to BeautifulSoup.
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup()
    soup.feed(html)
    cveroot = "http://cve.mitre.org"
    # Read the main table, extracting the words from the table cells.
    hreftable = soup.fetch('a', {'href': re.compile('cvename')}, limit=4)
    h1table = soup.fetch('h1')
    h1string = str(h1table)
    if category == 'keyword':
        fonttable = soup.fetch('font', limit=11)
    else:
        fonttable = soup.fetch('font', limit=17)
    if (len(fonttable) == 3) or (re.search('error', h1string, re.IGNORECASE) is not None):
        irc.reply("No data found regarding " + word)
    else:
        cve = []
        href = []
        for line in fonttable:
            string = str(line)
            cve.append(re.sub('^.*">|</font>|\\n', '', string))
        for line in hreftable:
            string = str(line)
            splitstring = string.split('>')
            href.append(re.sub('^.*="|"', '', splitstring[0]))
        ret = "%s %s" % (cve[3], cve[4])
        if category == 'keyword':
            for link in href:
                ret += cveroot + link + " "
        else:
            ret += cve[8]
        irc.reply(ret)
def obtener_titulo(url):
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page)
    entrada = None
    for division in soup.fetch('div'):
        idntifica = division.get('id')
        if idntifica == "resultado":
            entrada = soup.find(id="resultado")
            entrada = entrada.getText()
    return entrada
def _get_seriedata(self, url):
    """Get serie name and all episodes from the given url
    @return serie name, episode data, last update day"""
    epdata = []
    f = urllib2.urlopen(url)
    bs = BeautifulSoup(f)
    # this will fail if the serie page has been redirected;
    # epguides uses a dimwitted meta refresh instead of a proper one...
    try:
        seriename = bs.fetch("h1")[0].renderContents()
        seriename = re.sub("<.*?>", "", seriename)  # remove HTML
    except IndexError:
        return
    # parse just the relevant parts with regexes
    filedata = bs.fetch("pre")[0].renderContents().split("\n")
    # regex-match the relevant parts
    for line in filedata:
        m = self.epRe.search(line)
        if m:
            # convert datestring to gmtime format
            t = time.strptime(m.group('date'), '%d %b %y')
            # put episode data into a nifty dict
            data = {'epno': m.group('no'),
                    'season': int(m.group('season')),
                    'episode': int(m.group('episode')),
                    'prodno': m.group('prodno'),
                    'airdate': m.group('date'),
                    'airdate2': datetime.date(t[0], t[1], t[2]),
                    'epname': m.group('name')}
            epdata.append(data)
    # name of serie, episode data and date of last check
    return seriename, epdata, datetime.date.today()
def tryParseSearchDefinition(htmlTxt, fArtistSearch, modulesInfo, keywords):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    # no results
    input = soup.first("input", {"name": "albumName"})
    if input:
        return NO_RESULTS, None
    # get td's
    headerList = soup.fetch("td", {"class": "tb_header"})
    tdList = soup.fetch("td", {"class": "tb_row_r2"})
    if len(headerList) == 0 or len(tdList) == 0:
        return UNKNOWN_FORMAT, None
    # test modulo offset
    headersCount = len(headerList)
    if (len(tdList) % headersCount) != 0:
        return UNKNOWN_FORMAT, None
    searchResults = []
    # get results
    for index in range(len(tdList) - 1):
        artist = getAllTextFromTag(tdList[index]).strip()
        title = getAllTextFromTag(tdList[index + 1]).strip()
        urlStart = "show.php?id="
        aItem = tdList[index + 1].first("a", {"href": urlStart + "%"})
        if aItem:
            lyricsId = aItem['href'][len(urlStart):]
            searchResults.append([artist, title, lyricsId])
    if 0 == len(searchResults):
        return (UNKNOWN_FORMAT, None)
    if fArtistSearch:
        df = searchResultsToDefinitionThree(searchResults, modulesInfo)
    else:
        # df = searchResultsToDefinition(searchResults, modulesInfo)
        df = searchResultsToDefinitionTwo(searchResults, modulesInfo)
    return LYRICS_SEARCH, universalDataFormatWithDefinition(
        df, [["H", "Search: " + keywords]])
def iterMonthPlayLists(url):
    page = BeautifulSoup(urlopen(url))
    wordRE = re.compile(r'\w+')
    for table in page.fetch('table'):
        rows = iter(table.fetch('tr'))
        # first row: the date
        month, day, year = map(int, ''.join(rows.next().fetchText(wordRE)).split('/'))
        # second row: title, artist labels; ignore it
        rows.next()
        # remaining rows: title, artist data
        selections = [[''.join(col.fetchText(wordRE)).strip()
                       for col in row.fetch('td')]
                      for row in rows]
        yield PlayList(selections, Date(year, month, day))
def getSearchLinks(text):
    "Returns all the links from scraping a google custom search page, given an html document as input"
    soup = BeautifulSoup(text)
    # the h2 tags with class "r" hold the search result urls
    allLinks = soup.findAll('h2', {"class": "r"})
    allLinks = map(str, allLinks)
    fsoup = BeautifulSoup(''.join(allLinks))
    # iterate over all the href attributes and collect them
    links = []
    for item in fsoup.fetch('a'):
        links.append(item['href'])
    # BeautifulSoup returns unicode strings, hence converting to ascii
    links = map(lambda x: x.encode('ascii'), links)
    return links
def getAllLinks(htmlpage, reg=False):
    links = []
    if not reg:
        soup = BeautifulSoup(htmlpage)
        for item in soup.fetch('a'):
            links.append(item['href'])
        return encodeToAscii(links)
    else:
        linksList = re.findall('<a href=(.*?)>.*?</a>', str(htmlpage))
        for link in linksList:
            links.append(link)
        return links
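# Hypothetical usage sketch for getAllLinks; the sample html and the
# encodeToAscii stand-in are illustrative, not from the original module.
def encodeToAscii(links):
    # minimal stand-in matching how getAllLinks uses the real helper
    return [link.encode('ascii') for link in links]

sample = '<html><body><a href="http://example.com/a">a</a></body></html>'
print getAllLinks(sample)            # parser-based extraction
print getAllLinks(sample, reg=True)  # regex-based extraction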
def getallsubs(content, allowed_languages, filename="", search_string=""):
    soup = BeautifulSoup(content)
    subtitles = []
    sub_list = soup.fetch("div", dict(id="subtitle_list"))
    if not sub_list:
        return []
    table = sub_list[0].fetch("table")[0]
    if table is None:
        return []
    for element in table.findAll("tr")[1:]:
        num, title, rating, translate, upload, download = element.findAll("td")
        subtitle_name = title.find('br').previousSibling.strip().strip(" [En]")
        rating = int(round(float(rating.getText().strip('%')) / 100.0 * 5))
        sync = False
        if filename != "" and string.lower(filename) == string.lower(subtitle_name):
            sync = True
        for lang_name, _, let2, let3, _, _ in [
                ("Thai", "0", "th", "tha", "41", 30243),
                ("English", "2", "en", "eng", "11", 30212)]:
            if let3 not in allowed_languages:
                continue
            # rating is really the completeness; 0 means no Thai, so no point showing it
            if let3 == 'tha' and rating < 1:
                continue
            link = download.fetch("img", {'title': 'Download %s Subtitle' % lang_name})[0].parent['href']
            link = urljoin(MAIN_URL + "/manage/", link)
            lang = {'name': lang_name, '2let': let2, '3let': let3}
            subtitles.append({'rating': str(rating), 'filename': subtitle_name,
                              'sync': sync, 'link': link, 'lang': lang,
                              'hearing_imp': False})
    log(__name__, "got %s results" % len(subtitles))
    return subtitles
def findVideos(domain, sourceCode, videos, verbose):
    gotSeason = False
    gotURL = False
    printInfo1("\nSearching for links in code with the word '%s' in the url..." % videoText)
    soup = BeautifulSoup(sourceCode)
    for item in soup.fetch(['h2', 'a']):
        if verbose:
            printInfo1("\nParsing line: %s\n..." % item)
        if item.contents:
            if item.name == "h2" and seasonText in item.contents[0]:
                season = HTMLParser().unescape(item.contents[0])
                if verbose:
                    printInfo2("Found season text")
                    printInfo1("Season: %s" % season)
                gotSeason = True
            if item.name == "a" and videoText in item['href']:
                episodeTitle = HTMLParser().unescape(item['title'])
                url = item['href']
                if verbose:
                    printInfo2("Found link to video")
                    printInfo1("Episode title: %s" % episodeTitle)
                    printInfo1("URL: %s" % url)
                gotURL = True
        if not gotSeason and not gotURL:
            if verbose:
                printInfo2("No valuable info in this item")
        if gotURL:
            if not gotSeason:
                season = "None"
            url = urljoin(domain, url)
            if verbose:
                printInfo1("Adding...")
                printInfo1("URL: %s" % url)
                printInfo1("Season: %s" % season)
                printInfo1("Episode title: %s" % episodeTitle)
            videos.append({'url': url, 'season': season, 'episodeTitle': episodeTitle})
            gotSeason = False
            gotURL = False
    printInfo1("Found %s videos" % len(videos))
    return videos
def councilMembers(self, follow_links=True):
    br = self._get_new_browser()
    response = br.open(self._people_uri)
    # loop through the pages, yielding each of the results
    all_results = False
    while all_results is False:
        soup = BeautifulSoup(response.read())
        table = soup.find('table', id='ctl00_ContentPlaceHolder1_gridPeople_ctl00')
        for councilman, headers, row in self.parseDataTable(table):
            if follow_links and type(councilman['Person Name']) == dict:
                detail_url = self.host + councilman['Person Name']['url']
                response = br.open(detail_url)
                soup = BeautifulSoup(response.read())
                img = soup.find('img', {'id': 'ctl00_ContentPlaceHolder1_imgPhoto'})
                if img:
                    councilman['Photo'] = self.host + img['src']
            yield councilman
        current_page = soup.fetch('a', {'class': 'rgCurrentPage'})
        if current_page:
            current_page = current_page[0]
            next_page = current_page.findNextSibling('a')
        else:
            next_page = None
        if next_page:
            print 'reading page', next_page.text
            print
            event_target = next_page['href'].split("'")[1]
            br.select_form('aspnetForm')
            data = self._data(br.form, event_target)
            del data['ctl00$ContentPlaceHolder1$gridPeople$ctl00$ctl02$ctl01$ctl01']
            data = urllib.urlencode(data)
            response = _try_connect(br, self._people_uri, data)
        else:
            all_results = True
    raise StopIteration
def parseStock(htmlTxt):
    # work around malformed comments ("<! -- ") in the source html
    htmlTxt = htmlTxt.replace("<! -- ", "<!---")
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    noResults = testNoResults(soup)
    if NO_RESULTS == noResults:
        return (NO_RESULTS, sNoResultsText)
    # get name
    nameTag = soup.first("td", {"height": "30", "class": "ygtb"})
    if not nameTag:
        return (UNKNOWN_FORMAT, sUnknownFormatText)
    name = getAllTextFromTag(nameTag).strip()
    # get all data from table
    bigTable = soup.fetch("table", {"width": "580", "id": "yfncsumtab"})
    if 1 != len(bigTable):
        return (UNKNOWN_FORMAT, sUnknownFormatText)
    tdDataList = bigTable[0].fetch("td", {"class": "yfnc_tabledata1"})
    innerList = [name]
    counter = 0
    for tdItem in tdDataList:
        if 2 == counter:
            # the 3rd element carries the up/down icon
            imgItem = tdDataList[2].first("img")
            upDown = ""
            if imgItem:
                upDown = imgItem['alt']
            innerList.append(upDown)
            bItem = tdDataList[2].first("b")
            itemText = ""
            if bItem:
                itemText = getAllTextFromTag(bItem).strip()
            innerList.append(itemText)
        else:
            itemText = getAllTextFromTag(tdItem).strip()
            innerList.append(itemText)
        counter += 1
    # any results?
    if 0 == counter:
        return (UNKNOWN_FORMAT, sUnknownFormatText)
    # one-item UDF
    outerList = [innerList]
    return (STOCKS_DATA, universalDataFormatReplaceEntities(outerList))
def GetLinks(root, patterns, isnum=True):
    u = urllib.urlopen(root)
    soup = BeautifulSoup(u.read())
    refs = []
    for p in patterns:
        refs += soup.fetch('a', {'href': re.compile(p)})
    links = []
    numbers = []
    for ref in refs:
        if isnum:
            if ref.text[:-1].isdigit():
                links.append(root + str(ref.text))
                numbers.append(int(str(ref.text)[:-1]))
        else:
            links.append(root + str(ref.text))
    if isnum:
        return zip(links, numbers)
    return links
def check_class_open(crns):
    open_crns = []
    for crn in crns:
        url = CRN_URL_TEMPLATE.format(crn)
        r = requests.get(url)
        soup = BeautifulSoup(r.text)
        # find the seat columns by their attrs: the first match is the
        # available seats, the second is the maximum seats
        available = soup.fetch(attrs={'class': 'dddefault', 'width': 30})[0]
        if available.text != u'0':
            open_crns.append(crn)
    return open_crns
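# Hypothetical usage sketch for check_class_open; CRN_URL_TEMPLATE is assumed
# to be a format string with a single slot for the CRN, along these lines:
# CRN_URL_TEMPLATE = 'https://banner.example.edu/schedule?crn={0}'
# open_sections = check_class_open(['12345', '67890'])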
def scrape_somafm_info(url):
    station_list = []
    page = urllib2.urlopen(url)
    html = page.read()
    soup = BeautifulSoup(html)
    for station in soup.fetch('li'):
        station_name = station.h3.contents[0]
        station_desc = ' '.join(station('p', {'class': 'descr'})[0].contents)
        station_desc = format_station_description(station_desc)
        station_url = station.a['href']
        pls_name_info = station_url.rsplit('/')
        pls_url = '%s%s%s' % (url, pls_name_info[2], '.pls')
        irs = InternetRadioStation(name=station_name,
                                   pls_url=pls_url,
                                   desc=station_desc)
        station_list.append(irs)
    return station_list
def areaCodeByCity(htmlTxt):
    returned = []
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    res = testNoResults(soup)
    if RESULTS_DATA != res:
        return (res, None)
    resultsTable = soup.first("table", {"summary": "Results Content"})
    if resultsTable:
        strong = resultsTable.first("strong")
        if strong:
            if getAllTextFromTag(strong).startswith("Multiple cities with"):
                aList = resultsTable.fetch("a")
                for aItem in aList:
                    city = getAllTextFromTag(aItem)
                    returned.append(city)
                if len(returned) == 0:
                    return (UNKNOWN_FORMAT, m411UnknownFormatText)
                return (MULTIPLE_SELECT, string.join(returned, "\n"))
        # results
        return reverseZIPCodeLookup(htmlTxt)
    tables = soup.fetch("table", {"summary": "Search Results"})
    if 0 == len(tables):
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    trList = []
    for tab in tables:
        trList += tab.fetch("tr")
    for tr in trList:
        tdList = tr.fetch("td")
        if len(tdList) == 3:
            code = getAllTextFromTag(tdList[0]).strip()
            country = getAllTextFromTag(tdList[1]).strip()
            timezone = getAllTextFromTag(tdList[2]).strip()
            if code != "New Search":
                returned.append((code, country, timezone))
    if len(returned) == 0:
        return (UNKNOWN_FORMAT, m411UnknownFormatText)
    return (RESULTS_DATA, universalDataFormatReplaceEntities(returned))
def click_link_text(self, link_text, timeout=settings.SMALL_TIMEOUT):
    """ This method clicks link text on a page """
    # If using phantomjs, might need to extract and open the link directly
    if self.browser == 'phantomjs':
        if self.is_link_text_visible(link_text):
            element = self.wait_for_link_text_visible(link_text)
            element.click()
            return
        source = self.driver.page_source
        soup = BeautifulSoup(source)
        html_links = soup.fetch('a')
        for html_link in html_links:
            if html_link.text == link_text:
                for html_attribute in html_link.attrs:
                    if html_attribute[0] == 'href':
                        href = html_attribute[1]
                        if href.startswith('//'):
                            link = "http:" + href
                        elif href.startswith('/'):
                            url = self.driver.current_url
                            domain_url = self.get_domain_url(url)
                            link = domain_url + href
                        else:
                            link = href
                        self.open(link)
                        return
                raise Exception(
                    'Could not parse link from link_text [%s]' % link_text)
        raise Exception("Link text [%s] was not found!" % link_text)
    # Not using phantomjs
    element = self.wait_for_link_text_visible(link_text, timeout=timeout)
    self._demo_mode_highlight_if_active(link_text, by=By.LINK_TEXT)
    pre_action_url = self.driver.current_url
    element.click()
    if settings.WAIT_FOR_RSC_ON_CLICKS:
        self.wait_for_ready_state_complete()
    if self.demo_mode:
        if self.driver.current_url != pre_action_url:
            self._demo_mode_pause_if_active()
        else:
            self._demo_mode_pause_if_active(tiny=True)
def define(self, irc, msg, args):
    """[word] look up the word in wordnet"""
    if len(args) != 1:
        irc.reply("you gotta give me a word to define")
        return
    word = self._prepare_term(args[0], "")
    url = 'http://wordnet.princeton.edu/perl/webwn?s=' + word
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup()
    soup.feed(html)
    maintable = soup.fetch('li')
    retdef = []
    checkfordefs = len(maintable)
    if checkfordefs != 0:
        for lines in maintable:
            converttostring = str(lines)
            definition = re.sub('^.*\(|\).*$', '', converttostring)
            retdef.append(definition)
    else:
        retdef.append("not found. Is %s spelled correctly?" % word)
    irc.reply(word + ": " + "; ".join(retdef))
def _parseList(htmlTxt):
    soup = BeautifulSoup()
    soup.feed(htmlTxt)
    trList = soup.fetch("tr")
    outerList = []
    for tr in trList:
        if len(tr.fetch("tr")) == 0:
            tdList = tr.fetch("td")
            if len(tdList) == 4:
                if tdList[0].first("span", {"class": "title"}):
                    rank = getAllTextFromTag(tdList[0])
                    title = getAllTextFromTag(tdList[1])
                    rating = getAllTextFromTag(tdList[2])
                    explicitness = getAllTextFromTag(tdList[3])
                    aItem = tdList[1].first("a")
                    if aItem:
                        url = aItem['href']
                        outerList.append((rank, title, rating, explicitness, url))
    if 0 == len(outerList):
        return (NO_RESULTS, jNoResultsText)
    return (JOKES_LIST, universalDataFormatReplaceEntities(outerList))
def get_all_images_on_page(page_url):
    prefix = page_url.split('://')[0]
    simple_url = page_url.split('://')[1]
    base_url = simple_url.split('/')[0]
    full_base_url = prefix + "://" + base_url + "/"
    html = requests.get(page_url)
    completed_source = web_core.rebuild_source(html.text, full_base_url)
    soup = BeautifulSoup(completed_source)
    imgs = soup.fetch('img', src=True, onload=None)
    image_url_list = []
    for img in imgs:
        link = img["src"].split("src=")[-1]
        compact_link = link.split('?')[0]
        if (compact_link.endswith('.png')
                or compact_link.endswith('.jpg')
                or compact_link.endswith('.jpeg')):
            if not link.startswith("http"):
                if ":" not in link:
                    link = full_base_url + link
                else:
                    # The link is weird. Skip it.
                    continue
            image_url_list.append(link)
    return image_url_list
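# Hypothetical usage sketch for get_all_images_on_page; the page URL is
# illustrative only.
# for image_url in get_all_images_on_page('http://example.com/gallery'):
#     print image_url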