def create_cmml(html, ogg_file):
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    time_re = re.compile("\\d{1,2}(:\\d{2}){2}")
    times = soup.findAll(text=time_re)
    if len(times) > 0:
        m = re.match('(.*)\\.[^\\.]+$', ogg_file)
        if m is not None:
            to_file = m.group(1) + ".cmml"
            cmml = ET.Element('cmml', attrib={'lang': 'en'})
            remove_ws = re.compile('\s+')
            for t in times:
                txt = ''
                for c in t.parent.findAll(text=True):
                    if c is not t:
                        txt += c
                txt = remove_ws.sub(' ', txt)
                txt = txt.strip()
                log("found chapter %s at %s" % (txt, t))
                # totem wants escaped html in the title attribute ('&' written as '&amp;')
                txt = txt.replace('&', '&amp;')
                clip = ET.Element('clip')
                clip.set('id', t)
                clip.set('start', ('npt:' + t))
                clip.set('title', txt)
                cmml.append(clip)
            ET.ElementTree(cmml).write(to_file, encoding='utf-8')
def pharseContact(self, text):
    kv_dic = TK56Contact.get_k_v_dic()
    result = {}
    soup = BeautifulSoup(text)
    soup.prettify()
    table = soup.findAll(attrs={"class": "st-tab"})[1]
    tds = table.findAll("td")
    allInfo = {}
    for info in tds:
        info = ("".join(info.fetchText(True))).strip().replace(" ", "")
        if not info:
            continue
        key = info.split(":")[0]
        value = info.replace(key + ":", "")
        allInfo[key] = value
    contacts = None
    for k, v in allInfo.items():
        if k.startswith("联系方式"):
            if contacts:
                contacts = contacts + "," + v
            else:
                contacts = v
            allInfo.pop(k)
    allInfo[u"联系方式"] = contacts
    for k, v in allInfo.iteritems():
        if k:
            result[kv_dic.get(k)] = v
    return result
def login(self):
    """
    Perform the actual login.

    This method takes the username and password passed in when the class was
    initialized. It then creates a dictionary with login information. This
    dictionary is passed into urllib2 to create a Request, which is then
    passed to ClientCookie.urlopen. This method returns a loginResponse,
    which is the source code from the default Iodine module.
    """
    try:  # Just in case we're trying to run without an Internet connection or something
        usernameKey = 'login_username'  # Defines the username field name
        passwordKey = 'login_password'  # Defines the password field name
        loginUrl = "https://iodine.tjhsst.edu"  # Defines the URL that the request will use
        loginInformation = {usernameKey: self.username, passwordKey: self.password}  # Creates a request dictionary
        loginInformation = urllib.urlencode(loginInformation)  # Encode the login information.
        loginRequest = urllib2.Request(loginUrl, loginInformation)  # Creates a Request that is used to log in
        loginResponse = ClientCookie.urlopen(loginRequest)  # Sends the login to Iodine and stores the PHP session ID.
        loginResponse = loginResponse.read()  # Get the HTML/XML from Iodine.
        webpage = BeautifulSoup(loginResponse)  # Set up a Beautiful Soup object
        eighthChangeUrl = webpage.find(id="menu_eighth")['href']  # Grab the eighth period change URL
        uid = eighthChangeUrl.split("uid/")[1]  # Get the UID based on the eighth period change URL
        self.uid = uid  # And set the uid as a class variable, effectively getting the UID for changing things
        self.isAuthenticated = True  # Yes, yes we are logged in.
        return True  # Yay, no error!
    except Exception, e:  # If we failed for whatever reason...
        self.uid = None  # Set the uid to none.
        self.isAuthenticated = False  # No, no we are not.
        print e
        raise Exception("Error in Authenticator: could not log in.")  # Raise an exception.
        raise IodineException("Error in Authenticator: could not log in.", "ERR_AUTHENTICATE_LOGIN")  # Raise an IodineException (unreachable: the line above already raises)
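# A minimal usage sketch for the login() method above. The class name
# "Authenticator" and its constructor signature are assumptions for
# illustration only; the snippet just shows that login() sets self.uid and
# self.isAuthenticated on success and raises on failure.
auth = Authenticator("jstudent", "s3cret")   # hypothetical class/constructor
if auth.login():
    print auth.uid              # UID parsed from the eighth-period change URL
    print auth.isAuthenticated  # True after a successful login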
class AllChannels:
    """
    This is the class that scrapes the BASE_URL!
    """
    def __init__(self):
        self.soup = BeautifulSoup(urllib.urlopen(ALL_CHANNELS_URL))

    def getChannels(self):
        channels = []
        ch = {}
        for c, div in enumerate(self.soup.findAll("div", {"class": re.compile(r'two columns.*')})):
            ch["link" + str(c)] = div.a['href']
            ch["name" + str(c)] = div.find("a").text
            ch["title" + str(c)] = div.a['title']
            channels.append(ch)
        return channels

    def downloadIcons(self):
        if not os.path.exists("icons"):
            os.mkdir("icons")
        for div in self.soup.findAll("div", {"class": re.compile(r'two columns.*')}):
            channel_url = div.a['href']
            icon_url = div.find("img")["src"]
            if not os.path.isfile("icons" + os.sep + str(channel_url + ".png")):
                try:
                    urllib.urlretrieve(str(BASE_URL + icon_url), "icons" + os.sep + str(channel_url + ".png"))
                except:
                    pass
def get_ip_locate(ip_address):
    errf = open(err_output_file, 'a')
    opener = build_opener()
    result_info = ''
    avaliable_list = []
    err_list = []
    result_list = []
    url = 'http://www.ip138.com/ips.asp?ip=%s&action=2' % ip_address
    page = opener.open(url).read()
    soup = BeautifulSoup(page)
    a = soup.findAll('table')[2].findAll("tr")[2].findAll('li')
    for i in a:
        k = i.string
        try:
            j = get_location_info_from_nodist(k, ip_address)
            result_list.append(j)
        except:
            err_address = "%s|%s" % (k, ip_address)
            errf.write("%s\n" % err_address.encode('utf-8'))
    if '省' in result_list[0] or '市' in result_list[0] or '区' in result_list[0]:
        result_info = result_list[0]
    elif '省' not in result_list[0] or '市' not in result_list[0]:
        for i in result_list:
            if '省' in i or '市' in i:
                avaliable_list.append(i)
        result_info = avaliable_list[0]
    return result_info
def process_content(content, award_type):
    # Need to process content and return a hash of the form:
    # {"type": "actor", "win_type": True, "name": "some_name", "movie": "movie name"}
    acadamy_awards = []
    soup = BeautifulSoup(content)
    all_divs = soup.find_all('div', class_='nomHangIndent')
    all_trs = []
    for div in all_divs:
        all_trs.append(div.parent.parent)
    # print all_trs
    for tr in all_trs:
        temp_row = {}
        st, win_star, person_details = tr.contents
        div, st = person_details.contents
        hrefs = div.find_all('a')
        for href in hrefs:
            if href.parent.name == 'div':
                name = href.string
                name = re.sub('Written| by|Screenplay|Original|Story', '', name)
                name = re.sub('&|;| and', ',', name)
                name = re.sub('\[[0-9]*\]|\[|\]|\'|novel|see Cast|original screenplay|Adapted for the screen|', '', name)
                name = re.sub(r'\\n', ',', name)
                name = re.sub(',\s*,', ',', name)
                name = re.sub(', Jr.', ' Jr.', name)
        for tag in tr.parent.parent.previous_elements:
            if tag.name == 'dt':
                dt = tag
                try:
                    year = dt.u.a.string
                except Exception, e:
                    year = dt.a.string
                finally:
                    year = re.sub(" \([0-9]+[a-z]+\)", '', year)
def rottentomatoScraper(movieData):
    query = urllib.urlencode({'q': movieData + " rotten tomatoes"})
    jsonObj = None
    # while jsonObj['responseData'] == None:
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    jsonObj = json.loads(response)
    # try:
    result = jsonObj['responseData']['results']
    # print("Hey")
    # except:
    #     pass
    url = result[0]['url']
    r = urllib.urlopen(url).read()
    soup = BeautifulSoup(r)
    # add image, cast members, genre, reviews
    criticConsensus = soup.find("p", {"class": "critic_consensus superPageFontColor"}).contents[2].strip()
    criticRating = soup.find("span", {'itemprop': "ratingValue"}).contents[0].strip()
    userRating = soup.find("span", {'itemprop': "ratingValue"}).contents[0].strip()
    director = soup.find("span", {"itemprop": "name"}).contents[0].strip()
    cast = [span.contents[0].strip() for span in soup.findAll("span", {'itemprop': "name"})[1:4]]
    genres = [span.contents[0].strip() for span in soup.findAll("span", {'itemprop': "genre"})]
    image = soup.find("img", {"class": " posterImage"})["src"]
    r = urllib.urlopen(url + "/reviews/?type=top_critics").read()
    soup = BeautifulSoup(r)
    reviews = [div.contents[0].strip() for div in soup.findAll("div", {"class": "the_review"})[:6]]
    return {"criticConsensus": criticConsensus, "criticRating": criticRating, "userRating": userRating,
            "cast": cast, "image": image, "reviews": reviews, "director": director, "genres": genres}
def getCheYuanPage(self, page): data = { "__VIEWSTATE": "/wEPDwULLTE5ODUzMDUxMDQPZBYEAgMPZBYCZg9kFgICAQ9kFgxmDxYCHgdWaXNpYmxlZ2QCAQ8PFgQeC05hdmlnYXRlVXJsBakBaHR0cDovL3d3dy5reXdtYWxsLmNvbS9tZW1iZXIvbWVtYmVyL2xvZ2luLmFzcHg/Z291cmw9aHR0cCUzYSUyZiUyZndsLmt5d21hbGwuY29tJTJmd2xfc2VhcmNoJTJmd2xfc2VhcmNoX2Nhcl9saXN0LmFzcHglM2Z0eXBlJTNkMiUyNnByb3ZpbmNlJTNkLTElMjZjaXR5JTNkLTElMjZrd29yZCUzZB8AZxYCHgNyZWwFCG5vZm9sbG93ZAICDw8WAh8BBWtodHRwOi8vd3d3Lmt5d21hbGwuY29tL21lbWJlci9tZW1iZXIvbG9naW4uYXNweD9nb3VybD1odHRwOi8vd3d3Lmt5d21hbGwuY29tL21lbWJlci9tYW5hZ2UvbWJfYnVzaW5lc3MuYXNweBYCHwIFCG5vZm9sbG93ZAIDDw8WCh4EVGV4dAUS5p+l6K+i6L+Y5qy+6K6w5b2VHghDc3NDbGFzcwUKbmF2Rm9udFJlZB8BBSJodHRwOi8vZnEua3l3bWFsbC5jb20vaW5kZXhmcS5hc3B4HgZUYXJnZXQFBl9ibGFuax4EXyFTQgICFgIfAgUIbm9mb2xsb3dkAgQPDxYEHwEFrAFodHRwOi8vd3d3Lmt5d21hbGwuY29tL21lbWJlci9tZW1iZXIvcmVnaXN0ZXIuYXNweD9nb3VybD1odHRwJTNhJTJmJTJmd2wua3l3bWFsbC5jb20lMmZ3bF9zZWFyY2glMmZ3bF9zZWFyY2hfY2FyX2xpc3QuYXNweCUzZnR5cGUlM2QyJTI2cHJvdmluY2UlM2QtMSUyNmNpdHklM2QtMSUyNmt3b3JkJTNkHwBnFgIfAgUIbm9mb2xsb3dkAgUPD2QWAh8CBQhub2ZvbGxvd2QCBQ9kFgQCCQ8WAh4LXyFJdGVtQ291bnQCChYWZg9kFgJmDxUEBjM0NTY2Ng/nlr7otbDvvIzlv6vpgJ8LMTM1MjkyOTA3OTUQMjAxNC0wNS0wOSAwODoyM2QCAQ9kFgJmDxUEBjM0NTY1NVTmsrPljZfpg5Hlt57kuozkuIPljLotPuays+WNl+mDkeW3nu+8jOaciTHovoY057GzM+WQqOi9pu+8jOaxguaVtOi9pu+8jOS7t+agvOmdouiurjsNMTM2MzM4NjgxOTUgIBAyMDE0LTA1LTA1IDE0OjUxZAICD2QWAmYPFQQGMzQ1NjMwN+i+veWugeayiOmYsy0+6L695a6B5rKI6Ziz77yM5pyJMei+hjQuMuexs+i9pu+8jOaxgui0pzsNMTMzODY4NjMwMDggIBAyMDE0LTA0LTI0IDA3OjUwZAIDD2QWAmYPFQQGMzQ1NjI3sgHmuZbljZflsrPpmLPkuLTmuZjluIIgLT4g5bm/5Lic5bm/5bee55m95LqR5Yy6LOacieWbnueoizkuNuexs+WJjeWbm+WQjuWFq+i9pjPovoYs5rGC6LSnO+a5luWNl+Wys+mYs+S4tOa5mOW4giAtPiDlub/kuJzlub/lt57nmb3kupHljLos5pyJ5Zue56iLOS4257Gz5YmN5Zub5ZCO5YWr6L2mM+i+hizmsYLotKc7CzE4NjczMDE4NzI5EDIwMTQtMDQtMTkgMDc6NDNkAgQPZBYCZg8VBAYzNDU2MjZF5rGC5LiT57q/6LSn77yM6L+Q6LS55Y+v5Lul5ZWG6YeP44CC6YWN6LSn6LS55aW96K+077yM5pyJ5Zue6LSn5pu05aW9CzEzMzYzNzM4MDg4EDIwMTQtMDQtMTcgMTg6MzdkAgUPZBYCZg8VBAYzNDU2MDKfAeaIkeacieS4pOi+huWQjuWFq+i9rjYuMuWbnueoi+i9puWOu+aWsOeWhuWcsOWMuuWuieW+veS6s+W3nuiwr+WfjuWMui0+5paw55aG5LmM6bKB5pyo6b2Q5paw5biC5Yy677yM5pyJMui+hjYuMuexszEw5ZCo6L2m77yM5rGC6K6+5aSH6ZKi5p2Q5ZCo77yM5Lu35qC86Z2i6K6uOw0xODA1Njc2MTYyNyAgEDIwMTQtMDMtMjQgMTk6NTJkAgYPZBYCZg8VBAYzNDU1OTRN5rKz5YyX6YKi5Y+w5a6B5pmL5Y6/LT7msrPljZfpg5Hlt57vvIzmnIkxMuexs+i9pu+8jOaxguaVtOi9pu+8jOS7t+agvOmdouiurjsNMTg4MzE5NTgyNjAgIBAyMDE0LTAzLTIyIDE1OjQ5ZAIHD2QWAmYPFQQGMzQ1NTc5XOWNiuaMgui9pu+8jDEz57Gz6ZW/77yMNDDlkKjvvIzluLjlubTot5Hov5DovpPvvIznlLXor53vvZ4xODkwMzkxODI3Nuays+WNl+eEpuS9nC0+6ZmV6KW/77yMDTE1OTkzNzI2NzcwICAQMjAxNC0wMy0yMCAxODozOGQCCA9kFgJmDxUEBjM0NTU1NwnljYrmjILovaYNMTU5OTM3MjY3NzAgIBAyMDE0LTAzLTEwIDEwOjI3ZAIJD2QWAmYPFQQGMzQ1NTU2DeS7t+agvOmdouiuriAbMTg2MDU3NTU3NDYgIOW/hei+vui0p+i/kCAgEDIwMTQtMDMtMDggMTg6MTBkAgoPZBYCAgEPFgIfAGgWAmYPZBYCAgEPDxYCHwBoZGQCCw9kFgJmDw8WAh4LUmVjb3JkY291bnQCm9cBZGRkSZbpfye+kq0gnlNVY/qCTbmTGo+EJYvmiA9Ebk/JB1M=", "__EVENTTARGET": "webPager$WebPager", "__EVENTARGUMENT": page + 1, "__EVENTVALIDATION": "/wEWCgLVsazyDgK/rvSaAwKewauqDALTxLDBDwKJl+zrAwKIqsLKAgLTupi8CgLxq8OpDQK+sdPcCAKM54rGBkcWkK+OppL4wN40dvl6wHiSyXqi0L/fjcNfG32cf6f7", "SProvinceCityArea1$hidP": -2, "SProvinceCityArea1$hidC": -2, "SProvinceCityArea1$hidA": -2, "SProvinceCityArea1$hidPName": None, "SProvinceCityArea1$hidCName": None, "SProvinceCityArea1$hidAName": None, "txtKeyword": "多个目的地用“,”隔开", "webPager$WebPager_input": page, } url = "http://wl.kywmall.com/wl_search/wl_search_car_list.aspx?type=2&province=-1&city=-1&kword=" text = self.httpClient.geturlcon(url, data) soup = 
BeautifulSoup(text) soup.prettify() hrefs = soup.findAll(name="a", href=re.compile("details")) id_list = [] for href in hrefs: str_url = str(href).split('"') id_list.append(str_url[1].split("=")[1]) return id_list
def getZhuanXianPage(self, page): data = { "__VIEWSTATE": "/wEPDwULLTE5ODUzMDUxMDQPZBYEAgMPZBYCZg9kFgICAQ9kFgxmDxYCHgdWaXNpYmxlZ2QCAQ8PFgQeC05hdmlnYXRlVXJsBaoBaHR0cDovL3d3dy5reXdtYWxsLmNvbS9tZW1iZXIvbWVtYmVyL2xvZ2luLmFzcHg/Z291cmw9aHR0cCUzYSUyZiUyZndsLmt5d21hbGwuY29tJTJmd2xfc2VhcmNoJTJmd2xfc2VhcmNoX2xpbmVfbGlzdC5hc3B4JTNmdHlwZSUzZDMlMjZwcm92aW5jZSUzZC0xJTI2Y2l0eSUzZC0xJTI2a3dvcmQlM2QfAGcWAh4DcmVsBQhub2ZvbGxvd2QCAg8PFgIfAQVraHR0cDovL3d3dy5reXdtYWxsLmNvbS9tZW1iZXIvbWVtYmVyL2xvZ2luLmFzcHg/Z291cmw9aHR0cDovL3d3dy5reXdtYWxsLmNvbS9tZW1iZXIvbWFuYWdlL21iX2J1c2luZXNzLmFzcHgWAh8CBQhub2ZvbGxvd2QCAw8PFgoeBFRleHQFEuafpeivoui/mOasvuiusOW9lR4IQ3NzQ2xhc3MFCm5hdkZvbnRSZWQfAQUiaHR0cDovL2ZxLmt5d21hbGwuY29tL2luZGV4ZnEuYXNweB4GVGFyZ2V0BQZfYmxhbmseBF8hU0ICAhYCHwIFCG5vZm9sbG93ZAIEDw8WBB8BBa0BaHR0cDovL3d3dy5reXdtYWxsLmNvbS9tZW1iZXIvbWVtYmVyL3JlZ2lzdGVyLmFzcHg/Z291cmw9aHR0cCUzYSUyZiUyZndsLmt5d21hbGwuY29tJTJmd2xfc2VhcmNoJTJmd2xfc2VhcmNoX2xpbmVfbGlzdC5hc3B4JTNmdHlwZSUzZDMlMjZwcm92aW5jZSUzZC0xJTI2Y2l0eSUzZC0xJTI2a3dvcmQlM2QfAGcWAh8CBQhub2ZvbGxvd2QCBQ8PZBYCHwIFCG5vZm9sbG93ZAIFD2QWBAIJDxYCHgtfIUl0ZW1Db3VudAIKFhZmD2QWAmYPFQQGMzQ1NjY1V+S4k+e6v+iuvuWkh+i/kOi+k++8jOi0p+eJqei/kOi+k++8jOmVv+efremAlOi/kOi+k+eahOeJqea1geS4reW/g2h0dHA6Ly93d3cuZ3p4ZGJxLmNvbQsxMzUzOTk5MDU1MBAyMDE0LTA1LTA2IDIyOjI4ZAIBD2QWAmYPFQQGMzQ1NjI4Uua5luWNl+Wys+mYs+Wys+mYs+alvOWMuiAtPiDlub/kuJzkuJzojp4s5pyJOS4257Gz5YmN5Zub5ZCO5YWr6L2mMei+hizmsYIyN+WQqOi0pzsLMTg2NzMwMTg3MjkQMjAxNC0wNC0xOSAwNzo0NmQCAg9kFgJmDxUEBjM0NTYxNJMC5om/5o6l5LqM5omL5py65qKw44CB5YyW5bel5ZOB5Ye65Y+j77yM5bmz5p2/44CB5byA6aG244CB5pWj5p2C6Ii56K6i6Iix44CCDQrkuI3nrqHkvaDlh7rlj6PnmoTotKfnianmmK/lkKbmnInoh6rlt7HnmoTlh7rlj6PmnYPvvIzmiJHlj7jpg73lj6/ku6XluK7kvaDlronmjpLlh7rlj6Poh7Tlm73lpJbvvIzmiJHlj7jkuJPms6joh7Tlipvkuo7lnKjov5nkuIDpoobln5/vvIzlnKjkv53or4HoiLHkvY3nmoTmg4XlhrXkuIvvvIzmnInnnYDkuLDlr4znmoTmk43kvZznu4/pqozjgIILMTMzMTY1Njg3MDAQMjAxNC0wNC0wNSAxMjowMmQCAw9kFgJmDxUEBjI5OTM4NsAC5pys5YWs5Y+45om/5o6l5oiQ6YO96Iez5YWo5Zu95ZCE5Zyw5pW06L2m6Zu25ouF77yM5aSn5Lu26L+Q6L6T77yM6ZW/6YCU5pCs5a6277yM5LuT5YKo5Y+K5YyF6KOF5pyN5YqhIOaIkeWFrOWPuOmVv+acn+S6q+WPl+mTgemBk+mDqOmXqOeahOS8mOaDoOaUv+etlizku7fmoLzkvr/lrpzvvIzmnI3liqHkuIDmtYHvvIzlronlhajlv6vmjbfvvIzlkIzml7bmj5Dkvpvpl6jliLDpl6jmnI3liqHvvIzlhY3otLnlj5botKfjgILmiJHlhazlj7jlr7npg6jliIbln47luILlkozlnLDljLrov5vooYzmiZPmipjkvJjmg6DvvIzor6bmg4Xor7fmnaXnlLXlkqjor6LjgIILMTgzODIxNTQxMTEQMjAxMy0wOS0wMiAxNjoyOWQCBA9kFgJmDxUEBjI5NTEyNVTmuZbljZcgLT4g5paw55aG5LmM6bKB5pyo6b2QLOacieWbnueoi+W5s+adv+i9pizpq5jmoI/ovabvvIzmsYLotKc7ICAgICDngavovabkuJPnur8NMDczMS0yMjMwNjU4MRAyMDEzLTA4LTIzIDEwOjEyZAIFD2QWAmYPFQQGMjIxNDAwXuaxn+ilv+i1o+W3niAtPiDlub/kuJzmt7HlnLMs5pyJMTHnsbPpm4boo4XnrrHovaY06L6GLOaxgui0pzMwLTUw5ZCo5pyJ5oSP6K+36IGU57O7MTg5NzA3MDUwNTALMTU3NzkwNTYyNTUQMjAxMy0wNC0xNCAxMDoyMWQCBg9kFgJmDxUEBjE5NzUwM0PlsbHkuJzogYrln47pmLPosLfljr8gLT4g5rGf6IuP5peg6ZShLOacieWNiuaMgui9pjLovoYs5rGCNDDlkKjotKc7CzE1NTA2Njk3MDk5EDIwMTMtMDItMjggMTA6MzVkAgcPZBYCZg8VBAYxOTczMjYw5rKz5YyX55+z5a625bqEIC0+IOi0temYs++8jOaYhuaYjuS4k+e6vyzmsYLotKc7CzEzODMxMTMyOTY4EDIwMTMtMDItMjggMDg6NDdkAggPZBYCZg8VBAYxOTY2ODdP6YKi5Y+wLS0tLS0tLeWMl+S6rOWPr+S7peS4k+e6v++8jOacieino+aUvui0p+i9puWQjuWFq+i9ruS4gOi+hu+8jOaApeaxgui0p+a6kAsxNTAyODg4Mjc4MhAyMDEzLTAyLTI2IDE4OjA2ZAIJD2QWAmYPFQQGMTk2NjczMOWwgeS4mOiHs+mDkeW3nuS4rei9rOWPkeW+gOWFqOWbve+8jOaciei9puaxgui0pwsxMzQ2MjI3MDUxMhAyMDEzLTAyLTI2IDE3OjI3ZAIKD2QWAgIBDxYCHwBoFgJmD2QWAgIBDw8WAh8AaGRkAgsPZBYCZg8PFgIeC1JlY29yZGNvdW50As5AZGRkaoQ/PIfuAJZzT/BG4Uc6e5uBZF3WOACxXctYK+dFGzM=", "__EVENTTARGET": "webPager$WebPager", "__EVENTARGUMENT": page + 1, "__EVENTVALIDATION": 
"/wEWCgKg8oqQCwK/rvSaAwKewauqDALTxLDBDwKJl+zrAwKIqsLKAgLTupi8CgLxq8OpDQK+sdPcCAKM54rGBoJG+x3xGqZFPXmEAWZV+uEXME+5gUPMkTEQ+1TYW6ar", "SProvinceCityArea1$hidP": -2, "SProvinceCityArea1$hidC": -2, "SProvinceCityArea1$hidA": -2, "SProvinceCityArea1$hidPName": None, "SProvinceCityArea1$hidCName": None, "SProvinceCityArea1$hidAName": None, "txtKeyword": "多个目的地用“,”隔开", "webPager$WebPager_input": page, } url = "http://wl.kywmall.com/wl_search/wl_search_line_list.aspx?type=3&province=-1&city=-1&kword=" text = self.httpClient.geturlcon(url, data) soup = BeautifulSoup(text) soup.prettify() hrefs = soup.findAll(name="a", href=re.compile("details")) id_list = [] for href in hrefs: str_url = str(href).split('"') id_list.append(str_url[1].split("=")[1]) return id_list
def getPM25():
    url = "http://www.pm25.com/city/wuhan.html"
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0",
    }
    try:
        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req)
        content = response.read()
        response.close()
        pm = BSoup(content, from_encoding="utf-8")
        logging.info(pm.select(".citydata_updatetime")[0].get_text() + u" ")
        with open('pm2dot5.txt', 'a') as f:
            print>> f, pm.select(".citydata_updatetime")[0].get_text()
            for locate in pm.select(".pj_area_data ul:nth-of-type(1) li"):
                print>> f, locate.select(".pjadt_location")[0].get_text().rjust(15), "\t", \
                    locate.select(".pjadt_aqi")[0].get_text().rjust(15), "\t", \
                    locate.select(".pjadt_quality")[0].get_text().rjust(15), "\t", \
                    locate.select(".pjadt_wuranwu")[0].get_text().rjust(15), "\t", \
                    locate.select(".pjadt_pm25")[0].get_text().rjust(15), "\t", \
                    locate.select(".pjadt_pm10")[0].get_text().rjust(15)
            print>> f, "\n\n\n"
        return 0
    except Exception, e:
        logging.error(e)
        return 1
def getHuoYuanPage(self, page): data = { "__VIEWSTATE": "/wEPDwULLTE5ODUzMDUxMDQPZBYEAgMPZBYCZg9kFgICAQ9kFgxmDxYCHgdWaXNpYmxlZ2QCAQ8PFgQeC05hdmlnYXRlVXJsBa0BaHR0cDovL3d3dy5reXdtYWxsLmNvbS9tZW1iZXIvbWVtYmVyL2xvZ2luLmFzcHg/Z291cmw9aHR0cCUzYSUyZiUyZndsLmt5d21hbGwuY29tJTJmd2xfc2VhcmNoJTJmd2xfc2VhcmNoX3Byb2R1Y3RfbGlzdC5hc3B4JTNmdHlwZSUzZDElMjZwcm92aW5jZSUzZC0xJTI2Y2l0eSUzZC0xJTI2a3dvcmQlM2QfAGcWAh4DcmVsBQhub2ZvbGxvd2QCAg8PFgIfAQVraHR0cDovL3d3dy5reXdtYWxsLmNvbS9tZW1iZXIvbWVtYmVyL2xvZ2luLmFzcHg/Z291cmw9aHR0cDovL3d3dy5reXdtYWxsLmNvbS9tZW1iZXIvbWFuYWdlL21iX2J1c2luZXNzLmFzcHgWAh8CBQhub2ZvbGxvd2QCAw8PFgoeBFRleHQFEuafpeivoui/mOasvuiusOW9lR4IQ3NzQ2xhc3MFCm5hdkZvbnRSZWQfAQUiaHR0cDovL2ZxLmt5d21hbGwuY29tL2luZGV4ZnEuYXNweB4GVGFyZ2V0BQZfYmxhbmseBF8hU0ICAhYCHwIFCG5vZm9sbG93ZAIEDw8WBB8BBbABaHR0cDovL3d3dy5reXdtYWxsLmNvbS9tZW1iZXIvbWVtYmVyL3JlZ2lzdGVyLmFzcHg/Z291cmw9aHR0cCUzYSUyZiUyZndsLmt5d21hbGwuY29tJTJmd2xfc2VhcmNoJTJmd2xfc2VhcmNoX3Byb2R1Y3RfbGlzdC5hc3B4JTNmdHlwZSUzZDElMjZwcm92aW5jZSUzZC0xJTI2Y2l0eSUzZC0xJTI2a3dvcmQlM2QfAGcWAh8CBQhub2ZvbGxvd2QCBQ8PZBYCHwIFCG5vZm9sbG93ZAIFD2QWBAIJDxYCHgtfIUl0ZW1Db3VudAIKFhZmD2QWAmYPFQQGMzQ1NjY5bumZleilv+amhuael+WcsOWMuuelnuacqOWOvy0+5rmW5YyX5a6c5piM5a6c5piM5Y6/77yM5pyJ54Wk54KtMTAwMOWQqO+8jOaxguWkmui+hjEz57GzNDDlkKjovabvvIzku7fpq5jmgKXotbA7KDEzMDk4Mjk4MDk0ICAwOTEyLTg0NTM1MzUgIGxpYW5ncm9uZ2tlICAQMjAxNC0wNS0xOSAxMjoxNmQCAQ9kFgJmDxUEBjM0NTY2OFXmsZ/oi4/lrr/ov4EtPuaxn+iLj+a3ruWuieebseecmeWOv++8jOaciemHjei0pzHlkKjvvIzmsYIx6L6GMTXnsbPovabvvIzku7fpq5jmgKXotbA7DTE1MTYxMjI2ODcwICAQMjAxNC0wNS0xOCAxMDo0MWQCAg9kFgJmDxUEBjM0NTY2NzjotKfmupDvvJrmgKXpnIDopoHmsYLmnInlj5HliLDlm57ljrvovabotKfvvIzlpKfovaYxMuexsw0xNTE1MTg1MTcxMCAgEDIwMTQtMDUtMTMgMTk6MDZkAgMPZBYCZg8VBAYzNDU2NjTXAua5luWNl+ihoemYs+iAkumYs+W4gi0+5rKz5YyX6YKv6YO477yM5pyJ6YeN6LSnMjAw77yM5rGCMTPnsbPovabvvIzku7fpq5jmgKXotbA75rmW5Y2X6KGh6Ziz6ICS6Ziz5biCLT7msrPljJfpgq/pg7jvvIzmnInph43otKcyMDDvvIzmsYIxM+exs+i9pu+8jOS7t+mrmOaApei1sDvmuZbljZfooaHpmLPogJLpmLPluIItPuays+WMl+mCr+mDuO+8jOaciemHjei0pzIwMO+8jOaxgjEz57Gz6L2m77yM5Lu36auY5oCl6LWwO+a5luWNl+ihoemYs+iAkumYs+W4gi0+5rKz5YyX6YKv6YO477yM5pyJ6YeN6LSnMjAw77yM5rGCMTPnsbPovabvvIzku7fpq5jmgKXotbA75rmW5Y2X6KGh6Ziz6ICS6Ziz5biCLT4NMTM5NzQ3MTAyODYgIBAyMDE0LTA1LTA2IDExOjMzZAIED2QWAmYPFQQGMzQ1NjYzqgHmuZbljZfooaHpmLPogJLpmLPluIItPuWxseilv+WkquWOn++8jOacieW7uuadkDM0LTExMO+8jOaxgjEtM+i+hjEz57Gz6L2m77yMODYwMOWFgy875rmW5Y2X6KGh6Ziz6ICS6Ziz5biCLT7lsbHopb/lpKrljp/vvIzmnInlu7rmnZAzNC0xMTDvvIzmsYIxLTPovoYxM+exs+i9pu+8jDg2MDDlhYMvOw0xMzk3NDcxMDI4NiAgEDIwMTQtMDUtMDYgMTE6MzJkAgUPZBYCZg8VBAYzNDU2NjLIAua5luWNl+ihoemYs+iAkumYs+W4gi0+55SY6IKD5q2m5aiB5Zyw5Yy677yM5pyJ5bu65p2QMzXvvIzmsYIxM+exs+i9pu+8jDE3MDAw5YWDLzvmuZbljZfooaHpmLPogJLpmLPluIItPueUmOiCg+atpuWogeWcsOWMuu+8jOacieW7uuadkDM177yM5rGCMTPnsbPovabvvIwxNzAwMOWFgy875rmW5Y2X6KGh6Ziz6ICS6Ziz5biCLT7nlJjogoPmrablqIHlnLDljLrvvIzmnInlu7rmnZAzNe+8jOaxgjEz57Gz6L2m77yMMTcwMDDlhYMvO+a5luWNl+ihoemYs+iAkumYs+W4gi0+55SY6IKD5q2m5aiB5Zyw5Yy677yM5pyJ5bu65p2QMzXvvIzmsYIxM+exs+i9pu+8jDE3MDAw5YWDLzs6MTM5NzQ3MTAyODYgIDA3MzQtNDIyNDIxMSAgIDEzOTc1NDE3MzY4ICDogIHpm7fotKfov5Dnq5kgIBAyMDE0LTA1LTA2IDExOjI4ZAIGD2QWAmYPFQQGMzQ1NjYx4QHmuZbljZfooaHpmLPogJLpmLPluIItPuWxseilv+i/kOWfju+8jOaciemHjei0pzM177yM5rGCMTPnsbPovabvvIw3ODAw5YWDLzvmuZbljZfooaHpmLPogJLpmLPluIItPuWxseilv+i/kOWfju+8jOaciemHjei0pzM177yM5rGCMTPnsbPovabvvIw3ODAw5YWDLzvmuZbljZfooaHpmLPogJLpmLPluIItPuWxseilv+i/kOWfju+8jOaciemHjei0pzM177yM5rGCMTPnsbPovabvvIw3ODAw5YWDLzs6MTM5NzQ3MTAyODYgIOiAgembt+i0p+i/kOermSAgMDczNC00MjI0MjExICAgMTM5NzU0MTczNjggIBAyMDE0LTA1LTA2IDExOjI4ZAIHD2QWAmYPFQQGMzQ1NjYw1wLmuZbljZfooaHpmLPogJLpmLPluIItPuays+WMl+mCr+
mDuO+8jOaciemHjei0pzIwMO+8jOaxgjEz57Gz6L2m77yM5Lu36auY5oCl6LWwO+a5luWNl+ihoemYs+iAkumYs+W4gi0+5rKz5YyX6YKv6YO477yM5pyJ6YeN6LSnMjAw77yM5rGCMTPnsbPovabvvIzku7fpq5jmgKXotbA75rmW5Y2X6KGh6Ziz6ICS6Ziz5biCLT7msrPljJfpgq/pg7jvvIzmnInph43otKcyMDDvvIzmsYIxM+exs+i9pu+8jOS7t+mrmOaApei1sDvmuZbljZfooaHpmLPogJLpmLPluIItPuays+WMl+mCr+mDuO+8jOaciemHjei0pzIwMO+8jOaxgjEz57Gz6L2m77yM5Lu36auY5oCl6LWwO+a5luWNl+ihoemYs+iAkumYs+W4gi0+DTEzOTc0NzEwMjg2ICAQMjAxNC0wNS0wNiAxMDozM2QCCA9kFgJmDxUEBjM0NTY1OaoB5rmW5Y2X6KGh6Ziz6ICS6Ziz5biCLT7lsbHopb/lpKrljp/vvIzmnInlu7rmnZAzNC0xMTDvvIzmsYIxLTPovoYxM+exs+i9pu+8jDg2MDDlhYMvO+a5luWNl+ihoemYs+iAkumYs+W4gi0+5bGx6KW/5aSq5Y6f77yM5pyJ5bu65p2QMzQtMTEw77yM5rGCMS0z6L6GMTPnsbPovabvvIw4NjAw5YWDLzsNMTM5NzQ3MTAyODYgIBAyMDE0LTA1LTA2IDEwOjMyZAIJD2QWAmYPFQQGMzQ1NjU4yALmuZbljZfooaHpmLPogJLpmLPluIItPueUmOiCg+atpuWogeWcsOWMuu+8jOacieW7uuadkDM177yM5rGCMTPnsbPovabvvIwxNzAwMOWFgy875rmW5Y2X6KGh6Ziz6ICS6Ziz5biCLT7nlJjogoPmrablqIHlnLDljLrvvIzmnInlu7rmnZAzNe+8jOaxgjEz57Gz6L2m77yMMTcwMDDlhYMvO+a5luWNl+ihoemYs+iAkumYs+W4gi0+55SY6IKD5q2m5aiB5Zyw5Yy677yM5pyJ5bu65p2QMzXvvIzmsYIxM+exs+i9pu+8jDE3MDAw5YWDLzvmuZbljZfooaHpmLPogJLpmLPluIItPueUmOiCg+atpuWogeWcsOWMuu+8jOacieW7uuadkDM177yM5rGCMTPnsbPovabvvIwxNzAwMOWFgy87OjEzOTc0NzEwMjg2ICAwNzM0LTQyMjQyMTEgICAxMzk3NTQxNzM2OCAg6ICB6Zu36LSn6L+Q56uZICAQMjAxNC0wNS0wNiAxMDoyOGQCCg9kFgICAQ8WAh8AaBYCZg9kFgICAQ8PFgIfAGhkZAILD2QWAmYPDxYCHgtSZWNvcmRjb3VudALYjRFkZGS0MkBs5Z/XXB/pf4OF7cvKZ50NBi3Fx9/BpGgMqe3mUQ==", "__EVENTTARGET": "webPager$WebPager", "__EVENTARGUMENT": page + 1, "__EVENTVALIDATION": "/wEWCgKvz7e5CAK/rvSaAwKewauqDALTxLDBDwKJl+zrAwKIqsLKAgLTupi8CgLxq8OpDQK+sdPcCAKM54rGBuEJlK4dUlfESR5ctz/iPRP2/0Ifmh19XKMdEd06zeOc", "SProvinceCityArea1$hidP": -2, "SProvinceCityArea1$hidC": -2, "SProvinceCityArea1$hidA": -2, "SProvinceCityArea1$hidPName": None, "SProvinceCityArea1$hidCName": None, "SProvinceCityArea1$hidAName": None, "txtKeyword": "多个目的地用“,”隔开", "webPager$WebPager_input": page, } url = "http://wl.kywmall.com/wl_search/wl_search_product_list.aspx?type=1&province=-1&city=-1&kword=" text = self.httpClient.geturlcon(url, data) soup = BeautifulSoup(text) soup.prettify() hrefs = soup.findAll(name="a", href=re.compile("details")) id_list = [] for href in hrefs: str_url = str(href).split('"') id_list.append(str_url[1].split("=")[1]) return id_list
def scrub_html_email(text, cid_mapping={}):
    from BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup(text)
    for tag in soup.findAll(True):
        attrs = dict(tag.attrs)
        if 'src' in attrs:
            src = attrs['src']
            if src[:4] == 'cid:':
                tag['src'] = cid_mapping[src[4:]]
    mapped = soup.renderContents()
    scrubber = tuit.scrubber.Scrubber(autolink=False)
    # The scrubber removes complete html documents out of the box? Weird...
    scrubber.disallowed_tags_save_content.add('html')
    scrubber.disallowed_tags_save_content.add('body')
    scrubber.disallowed_tags_save_content.add('xml')
    scrubber.disallowed_tags_save_content.add('doctype')
    scrubber.allowed_attributes.add('color')
    scrubbed = scrubber.scrub(mapped)
    return scrubbed
def parseWebPageContent(self, html):
    '''
    Parse the page content: extract and analyse links, extend the crawl queue.

    string html -- HTML code of the page
    '''
    # print envEncode(html)
    soup = BeautifulSoup(html)
    for a in soup.findAll('a'):
        if self.checkIfLinkShouldBeFollowed(a):
            url = a['href']
            # split the link into its components
            urlsplitResult = urlparse.urlsplit(url)
            # relative (local) links must not be included as-is
            # is a scheme given?
            if urlsplitResult.scheme == '':
                scheme = 'http'
            else:
                scheme = urlsplitResult.scheme
            # reassemble the link
            url = urlparse.urlunsplit((scheme, self.domain, urlsplitResult.path, urlsplitResult.query, '',))
            if url not in self.linksToFollow:
                self.linksToFollow.append(url)
def __parse(self, result=None):
    type = result.getheader('content-type')
    if type == None or 'text' not in type:
        # return the result object.
        return (result.url, [], [])
    # Check to see if this was called from redirect
    soup = BeautifulSoup(result.read())
    urls = []
    # These two attributes are going to be the most reliable
    # TODO: I should let the tag attributes we want to pull urls out of be usr-defined in the fs settings.
    for tag in soup.findAll(href=True):
        urls.append(tag['href'])
    for tag in soup.findAll(src=True):
        urls.append(tag['src'])
    # OK, now see if the URL's match the target.
    # TODO: do we want to check if the url is on the same IP or not?
    valid_urls = []
    for url in urls:
        v_url = self.__validateURL(url)
        if v_url is not None:
            # See if it's not already marked for scanning
            if v_url not in self.scanned and v_url not in self.to_scan:
                valid_urls.append(v_url)
                self.to_scan.append(v_url)
    print len(self.to_scan)
    return (result.url, [], valid_urls)
def getList(base_url, list_kind):
    c = urllib2.urlopen(base_url + list_kind + "?sort=time&start=0&filter=all&mode=grid&tags_sort=count")
    soup = BeautifulSoup(c.read())
    c.close()
    totalNumber = soup.find("span", {"class": "subject-num"}).contents[0]
    separator = "/"
    # print totalNumber
    totalNumber = totalNumber[totalNumber.find(separator) + 7:]
    totalNumber = string.atoi(totalNumber)
    # print totalNumber
    bookList = soup.findAll("a", {"class": "nbg"})
    # print bookList
    beginNumber = 15
    while totalNumber - 15 > 0:
        c = urllib2.urlopen(
            base_url + list_kind + "?sort=time&start=" + str(beginNumber) + "&filter=all&mode=grid&tags_sort=count"
        )
        soup = BeautifulSoup(c.read())
        c.close()
        bookList = bookList + soup.findAll("a", {"class": "nbg"})
        totalNumber = totalNumber - 15
        beginNumber += 15
    return bookList
def GetTable():
    wiki = "http://en.wikipedia.org/wiki/List_of_districts_of_Germany"
    header = {'User-Agent': 'Mozilla/5.0'}  # Needed to prevent 403 error on Wikipedia
    req = urllib2.Request(wiki, headers=header)
    page = urllib2.urlopen(req)
    soup = BeautifulSoup(page)
    table = soup.find("table", {"class": "wikitable sortable"})
    print table
    # columns of the wikitable:
    #   District
    #   Type
    #   Land
    #   Capital
    T = []
    for row in table.findAll("tr"):
        cells = row.findAll("td")
        # For each "tr", assign each "td" to a variable.
        if len(cells) == 4:
            T.append([
                cells[0].find(text=True),
                cells[1].find(text=True),
                cells[2].find(text=True),
                cells[3].find(text=True)
            ])
    # now we have the list but we would like to have the coordinates.
    # We will ask gmap for the coordinates...
def CreateSubPage(self, packageName):
    if not self.data:
        self.PrepareData()
    tab = self.GenerateTab(current=packageName)
    counter = 0
    htmlList = '<table class="directory">\n<tbody>\n'
    keysI = self.data[packageName].keys()
    keysI.sort()
    for i in keysI:
        if counter % 2 == 0:
            htmlList += '<tr id="row_%d_" class="even">\n' % counter
        else:
            htmlList += '<tr id="row_%d_">\n' % counter
        htmlList += '<td class="entry">\n<img src="ftv2node.png" alt="o" width="16" height="22">\n'
        htmlList += '<a class="el" href="%s" target="_self">%s</a>\n' % (self.data[packageName][i], i)
        htmlList += '</td>\n<td class="desc">\n</td>\n</tr>\n'
        counter += 1
    htmlList += '</tbody>\n</table>\n'
    temp = copy.deepcopy(self.packageSource)
    soup = BeautifulSoup(temp)
    list_ = soup.find('div', {"class": "directory"})
    list_.replaceWith(htmlList)
    tab_ = soup.find('ul', {"class": "tablist"})
    tab_.replaceWith(tab_.prettify() + tab)
    data = str(soup.prettify())
    # prettify() escapes the inserted markup, so convert the entities back to tags before writing
    self.WriteFile(self.__GetFileName(packageName), data.replace('&lt;', '<').replace('&gt;', '>'))
def pharseHYMeta(self, text, id):
    kv_dic = WTHuoYuan.get_k_v_dic()
    result = {}
    result["webSiteId"] = id
    soup = BeautifulSoup(text)
    soup.prettify()
    table = soup.findAll(attrs={"class": "mt10"})
    tds = table[0].findAll("td")
    tds.extend(table[1].findAll("td"))
    key = None
    value = None
    allInfo = {}
    for td in tds:
        text = "".join(td.fetchText(True)).strip()
        if text.endswith(":"):
            key = text.replace(":", "")
        else:
            value = text
            allInfo[key] = value
            key = None
            value = None
    for k, v in allInfo.iteritems():
        if k:
            result[kv_dic.get(k)] = v
    return result
def check_mesos_html(mesos_html):
    ## Find number of cpus from status page
    html_soup = BeautifulSoup(mesos_html)
    cpus_str = html_soup.findAll('td')[2].contents[0]
    mesos_num_cpus = int(cpus_str.strip("CPUs"))
    print "Mesos master reports " + str(mesos_num_cpus) + " CPUs"
def testCData(self):
    xml = "<root>foo<![CDATA[foobar]]>bar</root>"
    self.assertSoupEquals(xml, xml)
    r = re.compile("foo.*bar")
    soup = BeautifulSoup(xml)
    self.assertEquals(soup.find(text=r).string, "foobar")
    self.assertEquals(soup.find(text=r).__class__, CData)
def testComments(self):
    xml = "foo<!--foobar-->baz"
    self.assertSoupEquals(xml)
    r = re.compile("foo.*bar")
    soup = BeautifulSoup(xml)
    self.assertEquals(soup.find(text=r).string, "foobar")
    self.assertEquals(soup.find(text="foobar").__class__, Comment)
def testSiblings(self):
    soup = BeautifulSoup("<ul><li>1<p>A</p>B<li>2<li>3</ul>")
    secondLI = soup.find('li').nextSibling
    self.assert_(secondLI.name == 'li' and secondLI.string == '2')
    self.assertEquals(soup.find(text='1').nextSibling.name, 'p')
    self.assertEquals(soup.find('p').nextSibling, 'B')
    self.assertEquals(soup.find('p').nextSibling.previousSibling.nextSibling, 'B')
def linkSpider(url, linkNum, repeatVal):
    # Loop counter
    searchRepeat = 0
    # Temp holding for links.
    links = list()
    # Turns link into searchable soup.
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html)
    # Loops as long as the increment variable is less than the requested repeat
    # value.
    while searchRepeat < int(repeatVal):
        # Allows you to update the soup from new links.
        soup = BeautifulSoup(html)
        # Loops through current soup looking for links and appends the list
        # temporarily.
        for link in soup.findAll("a", href=True):
            print (link.get("href", None))
            links.append(link.get("href", None))
        # Helps you see a break between searches.
        print "Break"
        # Changes the url to the new url. Offset used to compensate for
        # difference between computer starting at 0 and humans starting at 1
        # when counting objects.
        newUrl = links[int(linkNum) - 1]
        html = urllib.urlopen(newUrl).read()
        # Clears out the list so the new soup can be placed inside and indexed.
        links[:] = []
        searchRepeat += 1
    print "Final URL is: ", newUrl
def testSiblings(self):
    soup = BeautifulSoup("<ul><li>1<p>A</p>B<li>2<li>3</ul>")
    secondLI = soup.find("li").nextSibling
    self.assert_(secondLI.name == "li" and secondLI.string == "2")
    self.assertEquals(soup.find(text="1").nextSibling.name, "p")
    self.assertEquals(soup.find("p").nextSibling, "B")
    self.assertEquals(soup.find("p").nextSibling.previousSibling.nextSibling, "B")
def testQuotedAttributeValues(self):
    self.assertSoupEquals("<foo attr='bar'></foo>", '<foo attr="bar"></foo>')

    text = """<foo attr='bar "brawls" happen'>a</foo>"""
    soup = BeautifulSoup(text)
    self.assertEquals(soup.renderContents(), text)

    soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
    newText = """<foo attr='Brawls happen at "Bob&squot;s Bar"'>a</foo>"""
    self.assertSoupEquals(soup.renderContents(), newText)

    self.assertSoupEquals('<this is="really messed up & stuff">',
                          '<this is="really messed up & stuff"></this>')

    # This is not what the original author had in mind, but it's
    # a legitimate interpretation of what they wrote.
    self.assertSoupEquals("""<a href="foo</a>, </a><a href="bar">baz</a>""",
                          '<a href="foo</a>, </a><a href="></a>, <a href="bar">baz</a>')

    # SGMLParser generates bogus parse events when attribute values
    # contain embedded brackets, but at least Beautiful Soup fixes
    # it up a little.
    self.assertSoupEquals('<a b="<a>">', '<a b="<a>"></a><a>"></a>')
    self.assertSoupEquals('<a href="http://foo.com/<a> and blah and blah',
                          """<a href='"http://foo.com/'></a><a> and blah and blah</a>""")
def ask(self):
    html = self.session.get(self.zhi + '/people/Grapher/asks').text
    soup = BeautifulSoup(html)
    ask_gen = ((self.zhi + str(ask.get('href')) + ' ' + str(ask.get_text().encode('utf-8')))
               for ask in soup.find_all('a', 'question_link'))
    for ask in ask_gen:
        print ask
def getMetaData(sites, url):
    # Instantiate an array of related urls
    related = []
    # Open the url
    site = BeautifulSoup(str(urllib2.urlopen(url)))
    # Get all of the meta data keywords
    meta = site.findAll("meta")
    # Go through all of the sites
    for s in sites.all():
        # Open that site and get its meta data keywords
        tmp = BeautifulSoup(urllib2.urlopen(s.url)).findAll("meta")
        # Go through all of the meta keywords in meta
        for m in meta:
            # Go through all of the meta keywords in tmp
            for t in tmp:
                # If m == t...
                if m == t:
                    # It's a related site!
                    related.append(str(s))
    return related
def main():
    htmlpage = open('TowerVoting.html')
    soup = BeautifulSoup(htmlpage)
    people = []
    table = soup.find("table")
    body = table.find("tbody")
    tds = body.findAll("td")
    for td in tds:
        person = {}
        person["image"] = td.img["src"][21:]
        person["name"] = td.div.contents[0]
        people.append(person)
    for person in people:
        input = raw_input("Is {} a man?".format(person["name"]))
        if (input == ""):
            person["male"] = False
        else:
            person["male"] = True
    output_file = open('members.json', 'w')
    output = json.dumps(people)
    output_file.write(output)
def post():
    headers = login()
    thread_ids = []
    searchurl = 'http://postfarm.net/search.php?do=getdaily'
    srchresp, srchcont = http.request(searchurl, 'GET', headers=headers)
    soup = BeautifulSoup(srchcont)
    td_list = soup.findAll('td', id=re.compile('td_threadtitle_\d+'))
    for td in td_list:
        id = td['id']
        match = re.match("td_threadtitle_(\d+)", id)
        thread_ids.append(match.group(1))
    randurl, rand_thread_id = random_url(thread_ids)
    # NOTE: this parses the URL string itself, not the thread page; the security
    # token below is read from the search-page soup fetched above.
    page = BeautifulSoup(randurl)
    token_value = str(soup.find('input', attrs={'name': 'securitytoken'})['value'])
    messagebody = {'message': 'who wants some free nikes? we make em cheap',
                   'fromquickreply': '1',
                   's': '',
                   'securitytoken': token_value,
                   'do': 'postreply',
                   't': rand_thread_id,
                   'p': 'who cares',
                   'parseurl': '1'}
    resp, cont = http.request(randurl, 'POST', headers=headers, body=urllib.urlencode(messagebody))
def testUnicodePickle(self):
    import cPickle as pickle
    html = "<b>" + chr(0xc3) + "</b>"
    soup = BeautifulSoup(html)
    dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
    loaded = pickle.loads(dumped)
    self.assertEqual(loaded.decode(), soup.decode())
def getResultsByDayMonthYear(self, day, month, year):
    search_date = datetime.date(year, month, day)

    post_data = urllib.urlencode((
        ("REGFROMDATE.MAINBODY.WPACIS.1.", search_date.strftime(date_format)),
        ("REGTODATE.MAINBODY.WPACIS.1.", search_date.strftime(date_format)),
        ("SEARCHBUTTON.MAINBODY.WPACIS.1.", "Search"),
    ))

    response = urllib2.urlopen(self.search_url, post_data)
    contents = response.read()

    # Let's give scrapers the chance to tidy up any rubbish - I'm looking
    # at you Cannock Chase
    contents = self._fixHTML(contents)

    # Check for the no results warning
    if not contents.count("No Matching Applications Found"):
        soup = BeautifulSoup.BeautifulSoup(contents)

        # Get the links to later pages of results.
        later_pages = soup.findAll(
            "a", {"href": re.compile("WPHAPPSEARCHRES\.displayResultsURL.*StartIndex=\d*.*")})

        for a in ["initial_search"] + later_pages:
            if a != "initial_search":
                url = a['href']

                # Example url
                # http://digitalmaidstone.co.uk/swiftlg/apas/run/WPHAPPSEARCHRES.displayResultsURL?ResultID=244037&StartIndex=11&SortOrder=APNID:asc&DispResultsAs=WPHAPPSEARCHRES&BackURL=<a href=wphappcriteria.display?paSearchKey=147170>Search Criteria</a>

                # urllib2 doesn't like this url, to make it happy, we'll
                # get rid of the BackURL parameter, which we don't need.
                split_url = urlparse.urlsplit(url)
                qs = split_url[3]

                # This gets us a dictionary of key to lists of values
                qsl = cgi.parse_qsl(qs)

                # Get rid of BackURL
                qsl.pop(-1)  # I think this is safe, as there are no repeats of parameters

                new_qs = urllib.urlencode(qsl)
                url = urlparse.urlunsplit(split_url[:3] + (new_qs,) + split_url[4:])

                this_page_url = urlparse.urljoin(self.base_url, url)
                response = urllib2.urlopen(this_page_url)
                contents = response.read()
                soup = BeautifulSoup.BeautifulSoup(contents)

            results_table = self._findResultsTable(soup)  # .body.find("table", {"class": "apas_tbl"})
            trs = self._findTRs(results_table)

            for tr in trs:
                self._current_application = PlanningApplication()
                tds = tr.findAll("td")

                # The first td holds the application reference, e.g. (heavily
                # entity-mangled in the source page):
                # <td class="apas_tblContent"><a href="WPHAPPDETAIL.DisplayUrl?theApnID=07/1884&backURL=...">07/1884</td>
                # The html here is a bit of a mess, and doesn't all get into
                # the soup. We can get the reference from the first <a href> in td 0.
                first_link = tds[0].a['href']
                app_id = cgi.parse_qs(urlparse.urlsplit(first_link)[3])['theApnID'][0]

                self._current_application.date_received = search_date
                self._current_application.council_reference = app_id
                self._current_application.info_url = self.info_url % (app_id)
                self._current_application.comment_url = self.comment_url % (app_id)
                self._current_application.description = tds[1].string.strip()

                # the second td contains a hidden input followed by the address, e.g.
                # <td class="apas_tblContent"><input type="HIDDEN" name="ORDERCOUNTER.PAHEADER.PACIS2.1-1." value="1" class="input-box" size="7" />
                # LAND ADJ. BRAMBLING, HAWKENBURY ROAD, HAWKENBURY, TN120EA
                # </td>
                address = ' '.join([x for x in tds[2].contents
                                    if isinstance(x, BeautifulSoup.NavigableString)]).strip()

                self._current_application.address = address
                self._current_application.postcode = getPostcodeFromText(address)

                self._results.addApplication(self._current_application)

    return self._results
import BeautifulSoup
import requests
import os

url = 'https://ringzer0team.com/challenges/32/'  # challenge url
creds = {
    'username': '******',  # Replace with your creds
    'password': '******'   # in this dictionary
}

with requests.session() as s:
    # Login to site:
    postreq = s.post('https://ringzer0team.com/login', data=creds)
    # Go to challenge page now that we're logged in:
    page = s.get(url).content
    soup = BeautifulSoup.BeautifulSoup(page)
    # We know there's just one div result:
    div = soup.findAll('div', {'class': 'message'})[0]
    # Extract the text as string and remove useless garbage
    message = str(div.text)[25:-23]
    ops = message.split(' ')
    result = int(ops[0]) + int(ops[2], 16) - int(ops[4], 2)
    # Construct the URL and open it in the browser (make sure you're logged in in your browser as well)
    url += str(result)
    os.startfile(url)
import urllib
from BeautifulSoup import *

url = raw_input('Enter - ')
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)

# Retrieve all of the span tags
tags = soup('span')
count = 0
for tag in tags:
    count += int(tag.contents[0])
print count

"""
# Look at the parts of a tag
print 'TAG:', tag
print 'URL:', tag.get('class="comments"', None)
print 'Contents:', tag.contents[0]
print 'Attrs:', tag.attrs
"""
def search(query, tld='com', lang='en', num=10, start=0, stop=None, pause=2.0):
    """
    Search the given query string using Google.

    @type  query: str
    @param query: Query string. Must NOT be url-encoded.

    @type  tld: str
    @param tld: Top level domain.

    @type  lang: str
    @param lang: Language.

    @type  num: int
    @param num: Number of results per page.

    @type  start: int
    @param start: First result to retrieve.

    @type  stop: int
    @param stop: Last result to retrieve.
        Use C{None} to keep searching forever.

    @type  pause: float
    @param pause: Lapse to wait between HTTP requests.
        A lapse too long will make the search slow, but a lapse too short may
        cause Google to block your IP. Your mileage may vary!

    @rtype:  generator
    @return: Generator (iterator) that yields found URLs. If the C{stop}
        parameter is C{None} the iterator will loop forever.
    """
    # Set of hashes for the results found.
    # This is used to avoid repeated results.
    hashes = set()

    # Prepare the search string.
    query = urllib.quote_plus(query)

    # Grab the cookie from the home page.
    get_page(url_home % vars())

    # Prepare the URL of the first request.
    if num == 10:
        url = url_search % vars()
    else:
        url = url_search_num % vars()

    # Loop until we reach the maximum result, if any (otherwise, loop forever).
    while not stop or start < stop:

        # Sleep between requests.
        time.sleep(pause)

        # Request the Google Search results page.
        html = get_page(url)

        # Parse the response and process every anchored URL.
        soup = BeautifulSoup.BeautifulSoup(html)
        anchors = soup.findAll('a')
        for a in anchors:

            # Get the URL from the anchor tag.
            try:
                link = a['href']
            except KeyError:
                continue

            # Filter invalid links and links pointing to Google itself.
            link = filter_result(link)
            if not link:
                continue

            # Discard repeated results.
            h = hash(link)
            if h in hashes:
                continue
            hashes.add(h)

            # Yield the result.
            yield link

        # Prepare the URL for the next request.
        start += num
        if num == 10:
            url = url_next_page % vars()
        else:
            url = url_next_page_num % vars()
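# A short usage sketch for the search() generator above, assuming the same
# module also defines get_page(), filter_result() and the url_* templates that
# search() references. stop=20 is an arbitrary bound; without it the generator
# loops forever, as the docstring warns.
if __name__ == '__main__':
    for n, link in enumerate(search('site:python.org "Beautiful Soup"', stop=20, pause=2.0)):
        print n, link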
def served(self):
    t = urllib2.urlopen(self.headers["Referer"])
    html = t.read()
    soup = BeautifulSoup.BeautifulSoup(html)
    body = soup.find(["body"])
    return body
# get the rice price from CPF
import urllib
from BeautifulSoup import *


class bcolors:
    OKGREEN = '\033[92m'
    BOLD = '\033[1m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'


url = 'http://www.cpffeed.com/price_detail.html?product=8'
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)
price = soup.findAll("td", {"class": "price_product"})

'''
# Calculate price changed
last_price = float(price[10].contents[0].encode('utf-8'))
cur_price = float(price[3].contents[0].encode('utf-8'))
change = cur_price - last_price
'''

print "Fetched price from CPF"
print "---------------------"
for i in xrange(1, len(price), 7):
    try:
        last_price = float(price[i + 9].contents[0].encode('utf-8'))
        cur_price = float(price[i + 2].contents[0].encode('utf-8'))
        change = cur_price - last_price
    except (IndexError, ValueError):
        # assumed handler: the original snippet is truncated after this try block
        break
__author__ = 'stevenkaplan'
from sefaria.model import *
from BeautifulSoup import *
from sources.functions import *

if __name__ == "__main__":
    contents = BeautifulSoup(open("alt_struct.xml")).contents[2].contents[3]
    nodes = []
    contents = filter(lambda x: type(x) is not NavigableString, contents)
    for count, each in enumerate(contents):
        en, he = each.attrs[0][1].split(" / ")
        node = ArrayMapNode()
        node.add_primary_titles(en, he)
        node.depth = 0
        if count == 0:
            node.wholeRef = "Mesillat Yesharim, Introduction"
        else:
            node.wholeRef = "Mesillat Yesharim {}".format(count)
        node.refs = []
        nodes.append(node.serialize())
    index = get_index("Mesillat Yesharim", server="http://www.sefaria.org")
    index['alt_structs'] = {"Subject": {"nodes": nodes}}
    post_index(index, server="http://www.sefaria.org")
from pprint import pprint
import BeautifulSoup
import requests
import re
import urllib

print "Kranthi"

# product url
url = 'http://www.amazon.com/dp/B0074R0Z3O'
response = requests.get(url, headers={'User-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36'})
soup = BeautifulSoup.BeautifulSoup(response.content)
print soup.find(id="productTitle").string

# preparing dic to store info
dic = {}
dic['title'] = soup.find(id="productTitle").string

# array of reviews : MOST helpful reviews
reviewURLarr = []
for div in soup.findAll(id=re.compile('^rev-dpReviewsMostHelpfulAUI-.*')):
    for reviewURL in div.findAll('a', {"class": "a-link-normal a-text-normal a-color-base"}, href=True):
        reviewURLarr.append(str(reviewURL['href']))
print len(set(reviewURLarr))

# getting into review url and grabbing the data
revdic = {}
for reviewURL in set(reviewURLarr):
    # print reviewURL
    reviewResponse = requests.get(reviewURL, headers={'User-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36'})
    reviewSoup = BeautifulSoup.BeautifulSoup(reviewResponse.content)
import BeautifulSoup
import urllib
import re

u = 'http://www.xfront.com/us_states/'
soup = BeautifulSoup.BeautifulSoup(urllib.urlopen(u))
m = re.compile(
    r'<li>\n<p>Name: [\w ]*</p>\n<p>Capital Name: ([\w ]*)</p>\n<p>Capital Latitude: ([\d\.]*)</p>'
)
for a in sorted([m.match(str(x)).groups() for x in soup.findAll('li')],
                key=lambda x: x[1]):
    print a[0]
t = t + 1
if t % 100 == 0:
    print str(float(t) / 45) + '%'
word = word[:-1].split('\t')
xq = word[1]
word = ''.join(word)
# print xq, word
params = {
    'word': word,
    'type': '0',
    'pn': '0',
    'rn': '10',
    'submit': 'search'
}
result = html.get(url, params)
soup = BeautifulSoup.BeautifulSoup(result, fromEncoding="gb18030")
for ele in soup.findAll('h2'):
    try:
        xiaoquName = ""
        # tmp = unicode(ele.a.strong.contents[0])
        for e in ele.a.contents:
            if not isinstance(e, BeautifulSoup.NavigableString):
                boldKey = unicode(e.contents[0])
                # print boldKey
                xiaoquName = xiaoquName + boldKey
            else:
                key = unicode(e)
                xiaoquName = xiaoquName + key
        # print xiaoquName
        if str(boldKey) == xq:
            dic[boldKey] = xiaoquName
    except Exception:
        # assumed handler: the original fragment is truncated at this point
        pass
import urllib
import BeautifulSoup

inputfile = open("input.txt", "r")
courses = []
for line in inputfile:
    line = line.rstrip()
    if line == "":
        continue
    courses.append(line)
inputfile.close()

outputfile = open("output.html", "w")
outputfile.write("<html><body>")
for course in courses:
    page = urllib.urlopen("http://www.mcgill.ca/study/2011-2012/courses/" + course.replace(" ", "-"))
    soup = BeautifulSoup.BeautifulSoup(page.read())
    page.close()
    title = soup.findAll('h1')
    outputfile.write(str(title[1]).replace("h1", "h3"))
    result = soup.findAll('div', 'content')
    # Assuming it's the 3rd such div, but this might change!
    outputfile.write(result[2].prettify())
    outputfile.write("\n<br><hr>")
outputfile.write("</body></html>")
outputfile.close()
def download(base):
    f = open('/usr/share/doc/python/html/c-api/%s.html' % base, 'r')
    soup = BeautifulSoup.BeautifulSoup(f.read())
    return soup.findAll('dl', attrs={'class': 'function'})
# here's the root node
kmlStartString = '<?xml version="1.0" encoding="UTF-8"?>\n<kml xmlns="http://www.opengis.net/kml/2.2"\nxmlns:gx="http://www.google.com/kml/ext/2.2">\n'

# kmlStartString is the top of the kml file, it has the header info and styling for the placemarks in the tour.
kmlStartString += '<Document>\n<StyleMap id="weogeo_logo_map">\n<Pair>\n<key>normal</key>\n<styleUrl>#weogeo_logo</styleUrl>\n</Pair>\n<Pair>\n<key>highlight</key>\n<styleUrl>#weogeo_logo_h</styleUrl>\n</Pair>\n</StyleMap>\n<Style id="weogeo_logo">\n<IconStyle>\n<scale>1.1</scale>\n<Icon>\n<href>http://market.weogeo.com/ge/global/weo_button_small.png</href>\n</Icon>\n<hotSpot x="20" y="2" xunits="pixels" yunits="pixels"/>\n</IconStyle>\n</Style>\n<Style id="weogeo_logo_h">\n<IconStyle>\n<scale>1.3</scale>\n<Icon>\n<href>http://market.weogeo.com/ge/global/weo_button_small.png</href>\n</Icon>\n<hotSpot x="20" y="2" xunits="pixels" yunits="pixels"/>\n</IconStyle>\n</Style>\n<name>WeoGeo Kml Tour</name>\n<open>1</open>\n'

# kmlTourString is the string that holds the tour information ie: the playlist, etc.
kmlTourString = '<gx:Tour>\n<name>WeoGeo Tour</name>\n<gx:Playlist>\n'
kmlOverlayString = ''
kmlPlacemarkString = ''

# here's where we're going to have to add an entry for each map.
for fileName in os.listdir('tourKmls'):
    kmlFile = open(os.path.join('tourKmls', fileName), 'r')  # we open a file from tourKmls folder
    individualKmlString = kmlFile.read()  # read contents to string
    kmlFile.close()  # close the file
    # print "doing " + fileName  # debug
    try:
        # this block extracts the coordinates, the overlay tags, the names
        weoSoup = BeautifulSoup.BeautifulSoup(individualKmlString)
        documentContents = weoSoup.kml.document.contents
        for tag in documentContents:
            try:
                if tag.name == 'name':
                    nameString = tag.string
                    break
            except:
                pass
        overlayString = str(weoSoup.find('groundoverlay'))
        overlaySoup = BeautifulSoup.BeautifulSoup(overlayString)
        overlayContents = overlaySoup.contents
        for tag in overlayContents:
            if tag.name == 'name':
                tag.replaceWith('<name>' + nameString + '</name>')
        overlaySoup.groundoverlay['id'] = fileName
# beautiful soup program
import urllib
from BeautifulSoup import *

site = raw_input("Enter the site: ")
handle = urllib.urlopen('http://' + site).read()  # handle for beautiful soup
handle1 = BeautifulSoup(handle)
lst = handle1('script')
for tags in lst:
    print tags.get('src', None)
import urllib
from BeautifulSoup import *

url = raw_input('Enter url: ')
count = int(raw_input('Enter number of times to repeat: '))
position = int(raw_input('Enter position number: '))
links = list()

while count >= 0:
    html = urllib.urlopen(url).read()
    formattedHtml = BeautifulSoup(html)
    tags = formattedHtml('a')
    for tag in tags:
        links.append(str(tag.get('href', None)))
    print 'Retrieving: ' + url
    # find the url at position - 1 offset for index starting at 0
    url = links[position - 1]
    # refresh the links list to the new list of the next url
    del links[:]
    count = count - 1
# NOTE: this snippet was heavily broken (invalid find_all syntax, a kwords/keywords
# name mismatch, uninitialised counters, and repeated reads of br1); the version
# below is a best-effort, assumption-laden repair that keeps the original structure.
import urllib
from bs4 import BeautifulSoup  # assumed: the parser argument and find_all imply bs4

keywords = raw_input("dose tis lexeis kleidia: ")  # Greek: "give the keywords"
keywords = keywords.split(",")
mx1 = mx2 = mx3 = mx4 = 0

br1 = urllib.urlopen("http://www.brewerydb.com/style/88")
beer1 = BeautifulSoup(br1.read(), "html.parser")
for i in beer1.find_all(class_="description"):
    d1 = i.get_text()
    for j in keywords:
        if j in d1:  # assumed intent: count keywords appearing in the description
            mx1 = mx1 + 1

br2 = urllib.urlopen("http://www.brewerydb.com/style/62")
beer2 = BeautifulSoup(br2.read(), "html.parser")
for i in beer2.find_all(class_="description"):
    d2 = i.get_text()
    for j in keywords:
        if j in d2:
            mx2 = mx2 + 1

br3 = urllib.urlopen("http://www.brewerydb.com/style/119")
beer3 = BeautifulSoup(br3.read(), "html.parser")  # original re-read br1; assumed typo
for i in beer3.find_all(class_="description"):
    d3 = i.get_text()
    for j in keywords:
        if j in d3:
            mx3 = mx3 + 1

br4 = urllib.urlopen("http://www.brewerydb.com/style/15")
beer4 = BeautifulSoup(br4.read(), "html.parser")  # original re-read br1; assumed typo
for i in beer4.find_all(class_="description"):
    d4 = i.get_text()
    for j in keywords:
        if j in d4:
            mx4 = mx4 + 1  # the original snippet is truncated here; completed from the repeated pattern
url = 'https://onoffmix.com/event/'
driver = webdriver.Chrome('/webdriver/chromedriver')
driver.get(url)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
driver.execute_script("setMorePrint()")
time.sleep(10)
# driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# time.sleep(10)
req = driver.page_source
# headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'}
# req = requests.get(url, headers=headers)
soup = BeautifulSoup.BeautifulSoup(req)
h = HTMLParser()
soup2 = soup.find('div', attrs={'class': 'contentBox todayEventArea'})
with open('result.html', 'w') as f:
    f.write("""<!-- C\Code\mysite\elections\templates\elections\index.html -->
<!DOCTYPE html>
<html lang="en">
<head>
<title>강의 목록</title>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="http://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.0/jquery.min.js"></script>
<script src="http://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js"></script>
</head>
if __name__ == "__main__": if len(sys.argv) < 2: sys.stderr.write("not enough parameter!\n") sys.exit(1) # initialize variables htmlFullPath = sys.argv[1] htmlFilePath = os.path.split(htmlFullPath)[0] htmlFileName = os.path.split(htmlFullPath)[1] fileNameTemplate = htmlFileName.replace('.html', '_%s.html') # load the html page with open(htmlFullPath) as f: htmlPage = f.read() htmlPage = BeautifulSoup(htmlPage) # please have a look at the pages.html page. You will see that class name # of the related tab, which we will use to put 'index tab' by using this # tab, is different for pages.html file. For namespaces.html (namespace # list) and annotated.html (~class list) files, class names are the same # tabs2. this is why we are setting 'the destination tab class name' up # differently depending on the html file name. if htmlFileName == 'packageDocumentation.html': pages = extractPagesForPackage() destTabClassName = 'tabs' elif htmlFileName == 'configfiles.html': pages = extractPages(configFileFlag=True) destTabClassName = 'tabs2' else: pages = extractPages()
def access_home(self):
    r = self.opener.open(self.scrape_url)
    page = r.read()
    soup = BeautifulSoup.BeautifulSoup(page)
    self.ICSID = soup.find('input', {'type': 'hidden', 'name': 'ICSID'})['value']
    print "Browsing key found: %s" % self.ICSID
def getCoils(self, device):
    pkgQty = self.getPkgQty(self.devices[device][1])
    print 'Getting coils for', device
    parts = {}
    params = OrderedDict((('comId', self.devices[device][0]),
                          ('siteId', self.devices[device][1]),
                          ('requestId', self.devices[device][2])))
    try:
        request = self.s.post(
            "https://fastsolutions.mroadmin.com/Apex-Device/devicePOGAction_detailPOG.action",
            params=params)
    except:
        request = self.relog(
            "https://fastsolutions.mroadmin.com/Apex-Device/devicePOGAction_detailPOG.action",
            params)
    soup = BeautifulSoup.BeautifulSoup(request.content)
    self.branch = str(soup.find('div', {'id': 'head_company_name'}).contents[0][-5:])
    coils = int(soup.find('input', {'id': 'deviceBinCount'})['value'])
    lockers = int(soup.find('input', {'id': 'lockersCount'})['value'])

    for i in range(coils):
        row = soup.find('tr', {'id': 'tr' + str(i)})
        description = str(self.html_parser.unescape(row.contents[3].contents[0].strip()))
        if row.contents[5].contents[1].contents:
            SKU = re.search('[\d\w\[\]-]+',
                            str(row.contents[5].contents[1].contents[0].strip()))
            QTY = re.search('[\d.]+', str(row.contents[7].contents[0]))
            MAX = re.search('[\d.]+', str(row.contents[11].contents[0]))
            MIN = re.search('[\d.]+', str(row.contents[13].contents[0]))
            SKU, QTY, MAX, MIN = SKU.group(), int(float(QTY.group())), int(float(MAX.group())), int(float(MIN.group()))
            QTY, MAX, MIN = map(lambda x: x * pkgQty[SKU], [QTY, MAX, MIN])
            if SKU in parts:
                QTY, MAX, MIN = map(lambda x: x[1] + parts[SKU][x[0]],
                                    [('QTY', QTY), ('MAX', MAX), ('MIN', MIN)])
            parts[SKU] = {'description': description, 'QTY': QTY, 'MAX': MAX, 'MIN': MIN}
        else:
            print 'Data missing from position ' + str(int(float(row.contents[1].contents[0])))

    for i in range(lockers):
        row = soup.find('tr', {'id': 'lockerTr' + str(i)})
        description = str(self.html_parser.unescape(row.contents[3].contents[0].strip()))
        if row.contents[5].contents[1].contents:
            SKU = re.search('[\d\w\[\]-]+',
                            str(row.contents[5].contents[1].contents[0].strip()))
            QTY = re.search('[\d.]+', str(row.contents[7].contents[0]))
            MAX = re.search('[\d.]+', str(row.contents[9].contents[0]))
            MIN = re.search('[\d.]+', str(row.contents[11].contents[0]))
            SKU, QTY, MAX, MIN = SKU.group(), int(float(QTY.group())), int(float(MAX.group())), int(float(MIN.group()))
            QTY, MAX, MIN = map(lambda x: x * pkgQty[SKU], [QTY, MAX, MIN])
            if SKU in parts:
                QTY, MAX, MIN = map(lambda x: x[1] + parts[SKU][x[0]],
                                    [('QTY', QTY), ('MAX', MAX), ('MIN', MIN)])
            parts[SKU] = {'description': description, 'QTY': QTY, 'MAX': MAX, 'MIN': MIN}
        else:
            print 'Data missing from position ' + str(int(float(row.contents[1].contents[0])))

    return parts
excel = "C:\Users\succful\Desktop\\nicetest.xlsx" os.popen("md D:\\Download\>nul 2>nul") browser = spynner.Browser() browser.create_webview() browser.set_html_parser(pyquery.PyQuery) #browser.load("https://support.hp.com/us-en/drivers/selfservice/hp-elite-slice/12710078",load_timeout=120) #browser.load("https://support.hp.com/cn-zh/drivers/selfservice/hp-elite-slice/12710078",load_timeout=120) #open("Test.html", 'w+').write(browser.html.encode("utf-8")) f = open("D:\Download\Test.html", 'w+') f.write(browser.html.encode("utf-8")) f.close() #browser.close() soups = BeautifulSoup.BeautifulSoup(open("D:\Download\Test.html")) tag_button = soups.findAll( 'button', {"class": "hidden-lg button button-sm primary hpdiaButton"}) num = len(tag_button) ''' ntfile = open("ntname.txt","r") ntNames = ntfile.readlines() ntfile.close() ntName=ntName.strip("\n") ''' data = xlrd.open_workbook(excel) table = data.sheet_by_name(u'sheet') nrows = table.nrows for i in range(1, nrows): ntName = table.cell(i, 0).value i = 0
def get_home(self):
    r = self.opener.open(self.scrape_url)
    page = r.read()
    return BeautifulSoup.BeautifulSoup(page)
def LISTMOVIES(murl, name, index, page=1):
    turl = murl
    totalMoviesToLoad = settings.getNoOfMoviesToLoad()
    dialogWait = xbmcgui.DialogProgress()
    ret = dialogWait.create('Please wait until [Movies] are cached.')
    loadedLinks = 0
    totalLinks = totalMoviesToLoad
    remaining_display = 'Movies loaded :: [B]' + str(loadedLinks) + ' / ' + str(totalLinks) + '[/B].'
    dialogWait.update(0, '[B]Will load instantly from now on[/B]', remaining_display)
    xbmc.executebuiltin("XBMC.Dialog.Close(busydialog,true)")
    quality = None
    hindiMovie = False
    year = None
    pagesScanned = 0
    while (pagesScanned < 5) and (loadedLinks <= totalMoviesToLoad):
        purl = turl
        if int(page) > 1:
            purl = turl + "?paged=" + str(page)
        link = main.OPENURL(purl)
        soup = BeautifulSoup.BeautifulSoup(link).findAll('item')
        for item in soup:
            quality = ''
            hindiMovie = False
            year = ''
            name = item.title.text
            url = item.comments.text.replace('#comments', '')
            for category in item.findAll('category'):
                if category.text == 'Hindi Movies':
                    # print item
                    hindiMovie = True
                elif re.search('DVD', category.text, flags=re.I):
                    quality = ' [COLOR red][DVD][/COLOR] '
                elif re.search('/*BluRay/*', category.text, flags=re.I):
                    quality = ' [COLOR red][HD][/COLOR] '
                elif re.search('[1-2][0,9][0-9][0-9]', category.text, flags=re.I):
                    year = category.text
                if dialogWait.iscanceled():
                    return False
            if dialogWait.iscanceled():
                return False
            if hindiMovie:
                pagesScanned = 0
                main.addDirX(name + quality, url, constants.SOMINAL_LOADVIDEOS, '',
                             searchMeta=True, metaType='Movies', year=year)
                loadedLinks = loadedLinks + 1
                percent = (loadedLinks * 100) / totalLinks
                remaining_display = 'Movies loaded :: [B]' + str(loadedLinks) + ' / ' + str(totalLinks) + '[/B].'
                dialogWait.update(percent, '[B]Will load instantly from now on[/B]', remaining_display)
            if loadedLinks >= totalLinks:
                print 'BREAKING'
                break
            if dialogWait.iscanceled():
                return False
        if dialogWait.iscanceled():
            return False
        page = str(int(page) + 1)
        pagesScanned = pagesScanned + 1
    dialogWait.close()
    del dialogWait
    main.addDir('[COLOR blue]Next[/COLOR]', murl, constants.SOMINAL_LISTMOVIES,
                art + '/next.png', index=index, page=str(page))
    xbmcplugin.setContent(int(sys.argv[1]), 'Movies')
    main.setSeasonView()
def _confirm_long_listing(self, ICStateNum):
    # basically, press "yes" to confirm showing more than 100 results
    r = self.opener.open(self.scrape_url)
    page = r.read()
    return BeautifulSoup.BeautifulSoup(page)
'''
RE_TM = re.compile('Theresa May')
RE_LF = re.compile('Lynne Featherstone')
TM = 'Rt Hon Theresa May MP'
LF = 'Lynne Featherstone MP'

baseurl = 'http://www.equalities.gov.uk'
baselink = '/ministers/speeches-1.aspx'

# get speeches
speechlinks = []
html = scraperwiki.scrape(baseurl + baselink)
soup = BeautifulSoup.BeautifulSoup(html)
table = soup.find('table', {'summary': 'Speeches'})
data = table.findAll('td')
for contents in data:
    a = contents.find('a')
    if a:
        speechlinks.append(a['href'])

for link in speechlinks:
    if link.split('.')[-1] == 'doc':
        continue
    record = {}
    link = link.replace(u'\u2019', u'%92')
    link = baseurl + link
    print link
    record['department'] = 'Government Equalities Office'
    record['permalink'] = link
# -*- coding: utf-8 -*-
from urllib.request import urlopen  # fixed: 'import urlopen as urlopen' is not a valid import
from bs4 import BeautifulSoup       # fixed: the constructor below needs the class, not the bs4 module

url = "https://www.rottentomatoes.com/"
html = urlopen(url)
source = html.read()  # read the source as bytes
html.close()          # close the connection once urlopen is done

# Pass the document to the BeautifulSoup constructor to build the parse tree;
# by convention the resulting object is called "soup".
soup = BeautifulSoup(source, "html5lib")

table = soup.find(id="Top-Box-Office")
movies = table.find_all(class_="middle_col")
for movie in movies:
    title = movie.get_text()
    print(title)

    link = movie.a.get('href')
    url = 'https://www.rottentomatoes.com' + link
    print(url)
import urllib
import BeautifulSoup

html = '<div><span><a href=http://naver.com>naver.com</a></span></div>'
soup = BeautifulSoup.BeautifulSoup(html)
print soup.prettify()

data = urllib.urlopen('http://comic.naver.com/webtoon/list.nhn?titleId=20853&weekday=fri')
soup = BeautifulSoup.BeautifulSoup(data)
cartoons = soup.findAll('td', attrs={'class': 'title'})
title = cartoons[0].find('a').text
link = cartoons[0].find('a')['href']
print soup.prettify()
print title, link

'''
print cartoons[1].find('a').text
print cartoons[0]
'''
                hash_sha1 = hashlib.sha1(hash_cmd).hexdigest()
                hash_sha256 = hashlib.sha256(hash_cmd).hexdigest()
                self.SendMsg(canal, banner + '0,1[4 HASH 0]14 MD5 4=>15 {}'.format(str(hash_md5)))
                self.SendMsg(canal, banner + '0,1[4 HASH 0]14 SHA1 4=>15 {}'.format(str(hash_sha1)))
                self.SendMsg(canal, banner + '0,1[4 HASH 0]14 SHA256 4=>15 {}'.format(str(hash_sha256)))
            except:
                self.SendMsg(canal, banner + '0,1 Por favor, use:15 ' + self.prefix + 'hash <senha>')

        if command[0] == 'hashkill':
            try:
                hashkill_hash = command[1]
                self.SendMsg(canal, banner + '0,1[4 HASHKILL 0]14 Checando:15 {} '.format(str(hashkill_hash)))
                r = requests.get('http://hashtoolkit.com/reverse-hash?hash=' + str(hashkill_hash))
                soup = BeautifulSoup.BeautifulSoup(r.text)
                hash_type = soup.tbody.td.text
                hashkill_list = []
                for result in soup.findAll('td', {'class': 'res-text'}):
                    hashkill_resolved = result.span.text
                    hashkill_list.append(str(hashkill_resolved))
                if len(hashkill_list) == 1:
                    for hashes in hashkill_list:
                        self.SendMsg(canal, banner + '0,1[4 HASHKILL 0]14 Encontrado 4=>9 {} 4=>14 ({}) '.format(str(hashes), str(hash_type)))
                else:
                    self.SendMsg(canal, banner + '0,1[4 HASHKILL 0]4 Hash não encontrada. ')
            except:
                self.SendMsg(canal, banner + '0,1[4 HASHKILL 0]4 Hash não encontrada. ')

        if command[0] == 'hex':
def get_historical_low_price_from_sina_history(html_info):
    soup = BeautifulSoup.BeautifulSoup(html_info)
    data = soup.findAll('tr')[1:]
    price_list = [float(i.findAll('td')[8].text) for i in data]
    return list(reversed(price_list))
__name__ = "vutsuak" import urllib2 import BeautifulSoup as bs4 site = "http://www.cnn.com/2016/02/10/world/vancouver-island-human-foot/index.html" hdr = {'User-Agent': 'Mozilla/5.0'} req = urllib2.Request(site, headers=hdr) page = urllib2.urlopen(req) html = urllib2.urlopen(site) soup = bs4.BeautifulSoup(html) content = "" author = "" date = "" t = soup.title.string.split(" ")[0:6] title = "" for i in range(len(t)): title += t[i] + " " l = [] for node in soup.findAll('p'): l.append(''.join(node.findAll(text=True))) for i in range(7, len(l)): content += l[i] + "\n" author = l[3] date = l[5] print title print author print date print content