Code example #1
File: linux_outlaws_cmml.py  Project: elelay/Misc
def create_cmml(html, ogg_file):
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    time_re = re.compile("\\d{1,2}(:\\d{2}){2}")
    times = soup.findAll(text=time_re)
    if len(times) > 0:
        m = re.match('(.*)\\.[^\\.]+$',ogg_file)
        if m is not None:
            to_file = m.group(1) + ".cmml"
            cmml = ET.Element('cmml',attrib={'lang':'en'})
            remove_ws = re.compile('\s+')
            for t in times:
                txt = ''
                for c in t.parent.findAll(text=True):
                    if c is not t: txt += c
                txt = remove_ws.sub(' ', txt)
                txt = txt.strip()
                log("found chapter %s at %s"%(txt,t))
                # totem wants escaped html in the title attribute (not & but &amp;)
                txt = txt.replace('&','&amp;')
                clip = ET.Element('clip')
                clip.set('id',t)
                clip.set( 'start', ('npt:'+t))
                clip.set('title',txt)
                cmml.append(clip)
            ET.ElementTree(cmml).write(to_file,encoding='utf-8')
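
The chapter detection above hinges on the h:mm:ss pattern passed to findAll(text=...). A minimal sketch of just that matching step, using invented sample HTML:

import re
from BeautifulSoup import BeautifulSoup

# Invented show-notes fragment; only the first two list items carry timestamps.
sample = "<ul><li>0:01:23 Intro</li><li>1:02:33 News</li><li>no timestamp here</li></ul>"
soup = BeautifulSoup(sample)
time_re = re.compile("\\d{1,2}(:\\d{2}){2}")
for t in soup.findAll(text=time_re):
    print t.parent   # prints the two <li> elements that contain a timestamp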
Code example #2
File: tkParse_spider.py  Project: synckey/logistics
    def pharseContact(self, text):
        kv_dic = TK56Contact.get_k_v_dic()
        result = {}
        soup = BeautifulSoup(text)
        soup.prettify()
        table = soup.findAll(attrs={"class": "st-tab"})[1]
        tds = table.findAll("td")
        allInfo = {}
        for info in tds:
            info = ("".join(info.fetchText(True))).strip().replace(" ", "")
            if not info:
                continue
            key = info.split(":")[0]
            value = info.replace(key + ":", "")
            allInfo[key] = value
        contacts = None
        for k, v in allInfo.items():
            if k.startswith("联系方式"):
                if contacts:
                    contacts = contacts + "," + v
                else:
                    contacts = v
                allInfo.pop(k)
        allInfo[u"联系方式"] = contacts

        for k, v in allInfo.iteritems():
            if k:
                result[kv_dic.get(k)] = v
        return result
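
The cells above hold "key:value" pairs, so the key is everything before the first colon and the value is the rest. A tiny sketch of that split on an invented cell string:

# Invented cell text, split the same way as in pharseContact above.
info = u"联系方式:13800000000"
key = info.split(":")[0]
value = info.replace(key + ":", "")
print key.encode('utf-8'), value.encode('utf-8')   # 联系方式 13800000000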
Code example #3
File: Authenticator.py  Project: djezuz/ibot2
	def login(self):
		"""
		Perform the actual login. This method takes the username and password
		passed in when the class was initialized. It then creates a dictionary
		with login information. This dictionary is passed into urllib2 to create
		a Request, which is then passed to ClientCookie.urlopen. This method
		returns a loginResponse, which is the source code from the default
		Iodine module.
		"""
		try: # Just in case we're trying to run without an Internet connection or something
			usernameKey = 'login_username' # Defines the username field name
			passwordKey = 'login_password' # Defines the password field name
			loginUrl = "https://iodine.tjhsst.edu" # Defines the URL that the request will use
			loginInformation = {usernameKey: self.username, passwordKey: self.password} # Creates a request dictionary
			loginInformation = urllib.urlencode(loginInformation) # Encode the login information.
			loginRequest = urllib2.Request(loginUrl, loginInformation) # Creates a Request that is used to login
			loginResponse = ClientCookie.urlopen(loginRequest) # Sends the login to Iodine and stores the PHP session ID.
			loginResponse = loginResponse.read() # Get the HTML/XML from Iodine.
			webpage = BeautifulSoup(loginResponse) # Set up a Beautiful Soup object
			eighthChangeUrl = webpage.find(id="menu_eighth")['href'] # Grab the eighth period change URL
			uid = eighthChangeUrl.split("uid/")[1] # Get the UID based on the eighth period change URL
			self.uid = uid # And set the uid as a class variable, effectively getting the UID for changing things
			self.isAuthenticated = True # Yes, yes we are logged in.
			return True # Yay, no error!
		except Exception, e: # If we failed for whatever reason...
			self.uid = None # Set the uid to none.
			self.isAuthenticated = False # No, no we are not.
			print e
			raise Exception("Error in Authenticator: could not log in.") # Raise an exception.
			raise IodineException("Error in Authenticator: could not log in.", "ERR_AUTHENTICATE_LOGIN") # Raise an IodineException
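
For reference, the cookie-preserving POST described in the docstring can also be sketched with the standard cookielib/urllib2 modules instead of ClientCookie; the URL and field names are taken from the method above, the credentials are placeholders:

import urllib, urllib2, cookielib

jar = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
form = urllib.urlencode({'login_username': 'user', 'login_password': 'secret'})  # placeholder credentials
request = urllib2.Request("https://iodine.tjhsst.edu", form)  # supplying data makes this a POST
response = opener.open(request)
html = response.read()  # session cookies are now held in `jar` for later requests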
Code example #4
class AllChannels:
    """
        This is the class that scrapes the BASE_URL!
    """

    def __init__(self):
        self.soup = BeautifulSoup(urllib.urlopen(ALL_CHANNELS_URL))

    def getChannels(self):
        channels = []
        ch = {}
        for c, div in enumerate(self.soup.findAll("div", {"class": re.compile(r'two columns.*')})):
            ch["link" + str(c)] = div.a['href']
            ch["name" + str(c)] = div.find("a").text
            ch["title" + str(c)] = div.a['title']
            channels.append(ch)
        return channels

    def downloadIcons(self):
        if not os.path.exists("icons"):
            os.mkdir("icons")
        for div in self.soup.findAll("div", {"class": re.compile(r'two columns.*')}):
            channel_url = div.a['href']
            icon_url = div.find("img")["src"]
            if not os.path.isfile("icons" + os.sep + str(channel_url + ".png")):
                try:
                    urllib.urlretrieve(str(BASE_URL + icon_url), "icons" + os.sep + str(channel_url + ".png"))
                except:
                    pass
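
A usage sketch for the class above. BASE_URL and ALL_CHANNELS_URL are module-level constants that are not part of this excerpt, so placeholder values are assumed here:

BASE_URL = "http://example.com"             # placeholder, not the real site
ALL_CHANNELS_URL = BASE_URL + "/channels"   # placeholder, not the real listing URL

scraper = AllChannels()
for c, channel in enumerate(scraper.getChannels()):
    print channel.get("name" + str(c)), channel.get("link" + str(c))
scraper.downloadIcons()   # saves one <channel_url>.png per channel under ./icons/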
Code example #5
File: ip_locate.py  Project: mactanxin/ip_locate
def get_ip_locate(ip_address):
    errf = open(err_output_file,'a')
    opener = build_opener()
    result_info = ''
    avaliable_list = []
    err_list = []
    result_list = []
    url = 'http://www.ip138.com/ips.asp?ip=%s&action=2' %ip_address
    page = opener.open(url).read()
    soup = BeautifulSoup(page)
    a = soup.findAll('table')[2].findAll("tr")[2].findAll('li')
    for i in a:
        k = i.string
        try:
            j = get_location_info_from_nodist(k,ip_address)
            result_list.append(j)
        except:
            err_address = "%s|%s"%(k,ip_address)
            errf.write("%s\n"%err_address.encode('utf-8'))
    if '省' in  result_list[0] or '市' in result_list[0] or '区' in result_list[0]:                
        result_info = result_list[0]
    elif '省' not in  result_list[0] or '市' not in result_list[0]:
        for i in result_list:
            if '省' in i or '市' in i:
                avaliable_list.append(i)
                result_info = avaliable_list[0]
    return result_info
Code example #6
File: getOscarData.py  Project: moontails/PROM
def process_content(content,award_type):
	# Need to process content and return a hash of the form:
	# {"type" : "actor", "win_type":True, "name":"some_name","movie":"movie name"}
	acadamy_awards = []
	soup = BeautifulSoup(content)
	all_divs = soup.find_all('div',class_='nomHangIndent')
	all_trs = []
	for div in all_divs:
		all_trs.append(div.parent.parent)
	# print all_trs
	for tr in all_trs:
		temp_row = {}
		st, win_star, person_details  = tr.contents
		div, st = person_details.contents
		hrefs = div.find_all('a')
		for href in hrefs:
			if href.parent.name == 'div':
				name = href.string
				name = re.sub('Written| by|Screenplay|Original|Story','',name)
				name = re.sub('&|;| and',',',name)
				name = re.sub('\[[0-9]*\]|\[|\]|\'|novel|see Cast|original screenplay|Adapted for the screen|','',name)
				name = re.sub(r'\\n',',',name)
				name = re.sub(',\s*,',',',name)
				name = re.sub(', Jr.',' Jr.',name)
		for tag in tr.parent.parent.previous_elements:
			if tag.name == 'dt':
				dt = tag
				try:
					year = dt.u.a.string
				except Exception, e:
					year = dt.a.string
				finally:
					year = re.sub(" \([0-9]+[a-z]+\)",'',year)
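
The chain of re.sub calls above strips credit boilerplate ("Written by", "Screenplay", footnote markers) and normalises the separators to commas. A sketch of the same clean-up on an invented credit string:

import re

name = "Screenplay by John Doe and Jane Roe [3]"
name = re.sub('Written| by|Screenplay|Original|Story', '', name)
name = re.sub('&|;| and', ',', name)
name = re.sub('\[[0-9]*\]|\[|\]|\'|novel|see Cast|original screenplay|Adapted for the screen|', '', name)
name = re.sub(',\s*,', ',', name)
print name.strip()   # John Doe, Jane Roe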
Code example #7
def rottentomatoScraper(movieData):
	query = urllib.urlencode ({'q' : movieData + " rotten tomatoes"})
	jsonObj = None
	# while jsonObj['responseData'] == None:
	response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query ).read()
	jsonObj = json.loads(response)
		# try:
	result = jsonObj['responseData']['results']
			# print("Hey")
		# except:
		# 	pass
	url = result[0]['url']
	r = urllib.urlopen(url).read()
	soup = BeautifulSoup(r)
	#add image, cast members, genre, reviews
	criticConsensus = soup.find("p", {"class": "critic_consensus superPageFontColor"}).contents[2].strip()
	criticRating = soup.find("span", {'itemprop': "ratingValue"}).contents[0].strip()
	userRating = soup.find("span", {'itemprop': "ratingValue"}).contents[0].strip()
	director = soup.find("span", {"itemprop": "name"}).contents[0].strip()
	cast = [span.contents[0].strip() for span in soup.findAll("span", {'itemprop': "name"})[1:4]]
	genres = [span.contents[0].strip() for span in soup.findAll("span", {'itemprop': "genre"})]
	image = soup.find("img", {"class": " posterImage"})["src"]

	r = urllib.urlopen(url + "/reviews/?type=top_critics").read()
	soup = BeautifulSoup(r)
	reviews = [div.contents[0].strip() for div in soup.findAll("div", {"class": "the_review"})[:6]]

	return {"criticConsensus": criticConsensus, "criticRating": criticRating,
	"userRating": userRating, "cast": cast, "image": image, "reviews": reviews,
	"director": director, "genres": genres}
Code example #8
 def getCheYuanPage(self, page):
     data = {
         "__VIEWSTATE": "/wEPDwULLTE5ODUzMDUxMDQPZBYEAgMPZBYCZg9kFgICAQ9kFgxmDxYCHgdWaXNpYmxlZ2QCAQ8PFgQeC05hdmlnYXRlVXJsBakBaHR0cDovL3d3dy5reXdtYWxsLmNvbS9tZW1iZXIvbWVtYmVyL2xvZ2luLmFzcHg/Z291cmw9aHR0cCUzYSUyZiUyZndsLmt5d21hbGwuY29tJTJmd2xfc2VhcmNoJTJmd2xfc2VhcmNoX2Nhcl9saXN0LmFzcHglM2Z0eXBlJTNkMiUyNnByb3ZpbmNlJTNkLTElMjZjaXR5JTNkLTElMjZrd29yZCUzZB8AZxYCHgNyZWwFCG5vZm9sbG93ZAICDw8WAh8BBWtodHRwOi8vd3d3Lmt5d21hbGwuY29tL21lbWJlci9tZW1iZXIvbG9naW4uYXNweD9nb3VybD1odHRwOi8vd3d3Lmt5d21hbGwuY29tL21lbWJlci9tYW5hZ2UvbWJfYnVzaW5lc3MuYXNweBYCHwIFCG5vZm9sbG93ZAIDDw8WCh4EVGV4dAUS5p+l6K+i6L+Y5qy+6K6w5b2VHghDc3NDbGFzcwUKbmF2Rm9udFJlZB8BBSJodHRwOi8vZnEua3l3bWFsbC5jb20vaW5kZXhmcS5hc3B4HgZUYXJnZXQFBl9ibGFuax4EXyFTQgICFgIfAgUIbm9mb2xsb3dkAgQPDxYEHwEFrAFodHRwOi8vd3d3Lmt5d21hbGwuY29tL21lbWJlci9tZW1iZXIvcmVnaXN0ZXIuYXNweD9nb3VybD1odHRwJTNhJTJmJTJmd2wua3l3bWFsbC5jb20lMmZ3bF9zZWFyY2glMmZ3bF9zZWFyY2hfY2FyX2xpc3QuYXNweCUzZnR5cGUlM2QyJTI2cHJvdmluY2UlM2QtMSUyNmNpdHklM2QtMSUyNmt3b3JkJTNkHwBnFgIfAgUIbm9mb2xsb3dkAgUPD2QWAh8CBQhub2ZvbGxvd2QCBQ9kFgQCCQ8WAh4LXyFJdGVtQ291bnQCChYWZg9kFgJmDxUEBjM0NTY2Ng/nlr7otbDvvIzlv6vpgJ8LMTM1MjkyOTA3OTUQMjAxNC0wNS0wOSAwODoyM2QCAQ9kFgJmDxUEBjM0NTY1NVTmsrPljZfpg5Hlt57kuozkuIPljLotPuays+WNl+mDkeW3nu+8jOaciTHovoY057GzM+WQqOi9pu+8jOaxguaVtOi9pu+8jOS7t+agvOmdouiurjsNMTM2MzM4NjgxOTUgIBAyMDE0LTA1LTA1IDE0OjUxZAICD2QWAmYPFQQGMzQ1NjMwN+i+veWugeayiOmYsy0+6L695a6B5rKI6Ziz77yM5pyJMei+hjQuMuexs+i9pu+8jOaxgui0pzsNMTMzODY4NjMwMDggIBAyMDE0LTA0LTI0IDA3OjUwZAIDD2QWAmYPFQQGMzQ1NjI3sgHmuZbljZflsrPpmLPkuLTmuZjluIIgLT4g5bm/5Lic5bm/5bee55m95LqR5Yy6LOacieWbnueoizkuNuexs+WJjeWbm+WQjuWFq+i9pjPovoYs5rGC6LSnO+a5luWNl+Wys+mYs+S4tOa5mOW4giAtPiDlub/kuJzlub/lt57nmb3kupHljLos5pyJ5Zue56iLOS4257Gz5YmN5Zub5ZCO5YWr6L2mM+i+hizmsYLotKc7CzE4NjczMDE4NzI5EDIwMTQtMDQtMTkgMDc6NDNkAgQPZBYCZg8VBAYzNDU2MjZF5rGC5LiT57q/6LSn77yM6L+Q6LS55Y+v5Lul5ZWG6YeP44CC6YWN6LSn6LS55aW96K+077yM5pyJ5Zue6LSn5pu05aW9CzEzMzYzNzM4MDg4EDIwMTQtMDQtMTcgMTg6MzdkAgUPZBYCZg8VBAYzNDU2MDKfAeaIkeacieS4pOi+huWQjuWFq+i9rjYuMuWbnueoi+i9puWOu+aWsOeWhuWcsOWMuuWuieW+veS6s+W3nuiwr+WfjuWMui0+5paw55aG5LmM6bKB5pyo6b2Q5paw5biC5Yy677yM5pyJMui+hjYuMuexszEw5ZCo6L2m77yM5rGC6K6+5aSH6ZKi5p2Q5ZCo77yM5Lu35qC86Z2i6K6uOw0xODA1Njc2MTYyNyAgEDIwMTQtMDMtMjQgMTk6NTJkAgYPZBYCZg8VBAYzNDU1OTRN5rKz5YyX6YKi5Y+w5a6B5pmL5Y6/LT7msrPljZfpg5Hlt57vvIzmnIkxMuexs+i9pu+8jOaxguaVtOi9pu+8jOS7t+agvOmdouiurjsNMTg4MzE5NTgyNjAgIBAyMDE0LTAzLTIyIDE1OjQ5ZAIHD2QWAmYPFQQGMzQ1NTc5XOWNiuaMgui9pu+8jDEz57Gz6ZW/77yMNDDlkKjvvIzluLjlubTot5Hov5DovpPvvIznlLXor53vvZ4xODkwMzkxODI3Nuays+WNl+eEpuS9nC0+6ZmV6KW/77yMDTE1OTkzNzI2NzcwICAQMjAxNC0wMy0yMCAxODozOGQCCA9kFgJmDxUEBjM0NTU1NwnljYrmjILovaYNMTU5OTM3MjY3NzAgIBAyMDE0LTAzLTEwIDEwOjI3ZAIJD2QWAmYPFQQGMzQ1NTU2DeS7t+agvOmdouiuriAbMTg2MDU3NTU3NDYgIOW/hei+vui0p+i/kCAgEDIwMTQtMDMtMDggMTg6MTBkAgoPZBYCAgEPFgIfAGgWAmYPZBYCAgEPDxYCHwBoZGQCCw9kFgJmDw8WAh4LUmVjb3JkY291bnQCm9cBZGRkSZbpfye+kq0gnlNVY/qCTbmTGo+EJYvmiA9Ebk/JB1M=",
         "__EVENTTARGET": "webPager$WebPager",
         "__EVENTARGUMENT": page + 1,
         "__EVENTVALIDATION": "/wEWCgLVsazyDgK/rvSaAwKewauqDALTxLDBDwKJl+zrAwKIqsLKAgLTupi8CgLxq8OpDQK+sdPcCAKM54rGBkcWkK+OppL4wN40dvl6wHiSyXqi0L/fjcNfG32cf6f7",
         "SProvinceCityArea1$hidP": -2,
         "SProvinceCityArea1$hidC": -2,
         "SProvinceCityArea1$hidA": -2,
         "SProvinceCityArea1$hidPName": None,
         "SProvinceCityArea1$hidCName": None,
         "SProvinceCityArea1$hidAName": None,
         "txtKeyword": "多个目的地用“,”隔开",
         "webPager$WebPager_input": page,
     }
     url = "http://wl.kywmall.com/wl_search/wl_search_car_list.aspx?type=2&province=-1&city=-1&kword="
     text = self.httpClient.geturlcon(url, data)
     soup = BeautifulSoup(text)
     soup.prettify()
     hrefs = soup.findAll(name="a", href=re.compile("details"))
     id_list = []
     for href in hrefs:
         str_url = str(href).split('"')
         id_list.append(str_url[1].split("=")[1])
     return id_list
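
The id extraction above string-splits the rendered tag on double quotes; since BeautifulSoup exposes attributes directly, the same ids can be read from the href attribute, which is a little less fragile. A sketch on an invented anchor:

import re
from BeautifulSoup import BeautifulSoup

soup = BeautifulSoup('<a href="wl_car_details.aspx?id=345666">detail</a>')   # invented fragment
id_list = [a["href"].split("=")[1] for a in soup.findAll(name="a", href=re.compile("details"))]
print id_list   # ['345666']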
Code example #9
 def getZhuanXianPage(self, page):
     data = {
         "__VIEWSTATE": "/wEPDwULLTE5ODUzMDUxMDQPZBYEAgMPZBYCZg9kFgICAQ9kFgxmDxYCHgdWaXNpYmxlZ2QCAQ8PFgQeC05hdmlnYXRlVXJsBaoBaHR0cDovL3d3dy5reXdtYWxsLmNvbS9tZW1iZXIvbWVtYmVyL2xvZ2luLmFzcHg/Z291cmw9aHR0cCUzYSUyZiUyZndsLmt5d21hbGwuY29tJTJmd2xfc2VhcmNoJTJmd2xfc2VhcmNoX2xpbmVfbGlzdC5hc3B4JTNmdHlwZSUzZDMlMjZwcm92aW5jZSUzZC0xJTI2Y2l0eSUzZC0xJTI2a3dvcmQlM2QfAGcWAh4DcmVsBQhub2ZvbGxvd2QCAg8PFgIfAQVraHR0cDovL3d3dy5reXdtYWxsLmNvbS9tZW1iZXIvbWVtYmVyL2xvZ2luLmFzcHg/Z291cmw9aHR0cDovL3d3dy5reXdtYWxsLmNvbS9tZW1iZXIvbWFuYWdlL21iX2J1c2luZXNzLmFzcHgWAh8CBQhub2ZvbGxvd2QCAw8PFgoeBFRleHQFEuafpeivoui/mOasvuiusOW9lR4IQ3NzQ2xhc3MFCm5hdkZvbnRSZWQfAQUiaHR0cDovL2ZxLmt5d21hbGwuY29tL2luZGV4ZnEuYXNweB4GVGFyZ2V0BQZfYmxhbmseBF8hU0ICAhYCHwIFCG5vZm9sbG93ZAIEDw8WBB8BBa0BaHR0cDovL3d3dy5reXdtYWxsLmNvbS9tZW1iZXIvbWVtYmVyL3JlZ2lzdGVyLmFzcHg/Z291cmw9aHR0cCUzYSUyZiUyZndsLmt5d21hbGwuY29tJTJmd2xfc2VhcmNoJTJmd2xfc2VhcmNoX2xpbmVfbGlzdC5hc3B4JTNmdHlwZSUzZDMlMjZwcm92aW5jZSUzZC0xJTI2Y2l0eSUzZC0xJTI2a3dvcmQlM2QfAGcWAh8CBQhub2ZvbGxvd2QCBQ8PZBYCHwIFCG5vZm9sbG93ZAIFD2QWBAIJDxYCHgtfIUl0ZW1Db3VudAIKFhZmD2QWAmYPFQQGMzQ1NjY1V+S4k+e6v+iuvuWkh+i/kOi+k++8jOi0p+eJqei/kOi+k++8jOmVv+efremAlOi/kOi+k+eahOeJqea1geS4reW/g2h0dHA6Ly93d3cuZ3p4ZGJxLmNvbQsxMzUzOTk5MDU1MBAyMDE0LTA1LTA2IDIyOjI4ZAIBD2QWAmYPFQQGMzQ1NjI4Uua5luWNl+Wys+mYs+Wys+mYs+alvOWMuiAtPiDlub/kuJzkuJzojp4s5pyJOS4257Gz5YmN5Zub5ZCO5YWr6L2mMei+hizmsYIyN+WQqOi0pzsLMTg2NzMwMTg3MjkQMjAxNC0wNC0xOSAwNzo0NmQCAg9kFgJmDxUEBjM0NTYxNJMC5om/5o6l5LqM5omL5py65qKw44CB5YyW5bel5ZOB5Ye65Y+j77yM5bmz5p2/44CB5byA6aG244CB5pWj5p2C6Ii56K6i6Iix44CCDQrkuI3nrqHkvaDlh7rlj6PnmoTotKfnianmmK/lkKbmnInoh6rlt7HnmoTlh7rlj6PmnYPvvIzmiJHlj7jpg73lj6/ku6XluK7kvaDlronmjpLlh7rlj6Poh7Tlm73lpJbvvIzmiJHlj7jkuJPms6joh7Tlipvkuo7lnKjov5nkuIDpoobln5/vvIzlnKjkv53or4HoiLHkvY3nmoTmg4XlhrXkuIvvvIzmnInnnYDkuLDlr4znmoTmk43kvZznu4/pqozjgIILMTMzMTY1Njg3MDAQMjAxNC0wNC0wNSAxMjowMmQCAw9kFgJmDxUEBjI5OTM4NsAC5pys5YWs5Y+45om/5o6l5oiQ6YO96Iez5YWo5Zu95ZCE5Zyw5pW06L2m6Zu25ouF77yM5aSn5Lu26L+Q6L6T77yM6ZW/6YCU5pCs5a6277yM5LuT5YKo5Y+K5YyF6KOF5pyN5YqhIOaIkeWFrOWPuOmVv+acn+S6q+WPl+mTgemBk+mDqOmXqOeahOS8mOaDoOaUv+etlizku7fmoLzkvr/lrpzvvIzmnI3liqHkuIDmtYHvvIzlronlhajlv6vmjbfvvIzlkIzml7bmj5Dkvpvpl6jliLDpl6jmnI3liqHvvIzlhY3otLnlj5botKfjgILmiJHlhazlj7jlr7npg6jliIbln47luILlkozlnLDljLrov5vooYzmiZPmipjkvJjmg6DvvIzor6bmg4Xor7fmnaXnlLXlkqjor6LjgIILMTgzODIxNTQxMTEQMjAxMy0wOS0wMiAxNjoyOWQCBA9kFgJmDxUEBjI5NTEyNVTmuZbljZcgLT4g5paw55aG5LmM6bKB5pyo6b2QLOacieWbnueoi+W5s+adv+i9pizpq5jmoI/ovabvvIzmsYLotKc7ICAgICDngavovabkuJPnur8NMDczMS0yMjMwNjU4MRAyMDEzLTA4LTIzIDEwOjEyZAIFD2QWAmYPFQQGMjIxNDAwXuaxn+ilv+i1o+W3niAtPiDlub/kuJzmt7HlnLMs5pyJMTHnsbPpm4boo4XnrrHovaY06L6GLOaxgui0pzMwLTUw5ZCo5pyJ5oSP6K+36IGU57O7MTg5NzA3MDUwNTALMTU3NzkwNTYyNTUQMjAxMy0wNC0xNCAxMDoyMWQCBg9kFgJmDxUEBjE5NzUwM0PlsbHkuJzogYrln47pmLPosLfljr8gLT4g5rGf6IuP5peg6ZShLOacieWNiuaMgui9pjLovoYs5rGCNDDlkKjotKc7CzE1NTA2Njk3MDk5EDIwMTMtMDItMjggMTA6MzVkAgcPZBYCZg8VBAYxOTczMjYw5rKz5YyX55+z5a625bqEIC0+IOi0temYs++8jOaYhuaYjuS4k+e6vyzmsYLotKc7CzEzODMxMTMyOTY4EDIwMTMtMDItMjggMDg6NDdkAggPZBYCZg8VBAYxOTY2ODdP6YKi5Y+wLS0tLS0tLeWMl+S6rOWPr+S7peS4k+e6v++8jOacieino+aUvui0p+i9puWQjuWFq+i9ruS4gOi+hu+8jOaApeaxgui0p+a6kAsxNTAyODg4Mjc4MhAyMDEzLTAyLTI2IDE4OjA2ZAIJD2QWAmYPFQQGMTk2NjczMOWwgeS4mOiHs+mDkeW3nuS4rei9rOWPkeW+gOWFqOWbve+8jOaciei9puaxgui0pwsxMzQ2MjI3MDUxMhAyMDEzLTAyLTI2IDE3OjI3ZAIKD2QWAgIBDxYCHwBoFgJmD2QWAgIBDw8WAh8AaGRkAgsPZBYCZg8PFgIeC1JlY29yZGNvdW50As5AZGRkaoQ/PIfuAJZzT/BG4Uc6e5uBZF3WOACxXctYK+dFGzM=",
         "__EVENTTARGET": "webPager$WebPager",
         "__EVENTARGUMENT": page + 1,
         "__EVENTVALIDATION": "/wEWCgKg8oqQCwK/rvSaAwKewauqDALTxLDBDwKJl+zrAwKIqsLKAgLTupi8CgLxq8OpDQK+sdPcCAKM54rGBoJG+x3xGqZFPXmEAWZV+uEXME+5gUPMkTEQ+1TYW6ar",
         "SProvinceCityArea1$hidP": -2,
         "SProvinceCityArea1$hidC": -2,
         "SProvinceCityArea1$hidA": -2,
         "SProvinceCityArea1$hidPName": None,
         "SProvinceCityArea1$hidCName": None,
         "SProvinceCityArea1$hidAName": None,
         "txtKeyword": "多个目的地用“,”隔开",
         "webPager$WebPager_input": page,
     }
     url = "http://wl.kywmall.com/wl_search/wl_search_line_list.aspx?type=3&province=-1&city=-1&kword="
     text = self.httpClient.geturlcon(url, data)
     soup = BeautifulSoup(text)
     soup.prettify()
     hrefs = soup.findAll(name="a", href=re.compile("details"))
     id_list = []
     for href in hrefs:
         str_url = str(href).split('"')
         id_list.append(str_url[1].split("=")[1])
     return id_list
Code example #10
def getPM25():
    url = "http://www.pm25.com/city/wuhan.html"

    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0",
    }
    try:
        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req)
        content = response.read()
        response.close()
        pm = BSoup(content, from_encoding="utf-8")
        logging.info(pm.select(".citydata_updatetime")[0].get_text() + u" ")
        with open('pm2dot5.txt', 'a') as f:
            print>> f, pm.select(".citydata_updatetime")[0].get_text()
            for locate in pm.select(".pj_area_data ul:nth-of-type(1) li"):
                print>> f, locate.select(".pjadt_location")[0].get_text().rjust(15), "\t", \
                    locate.select(".pjadt_aqi")[0].get_text().rjust(15), "\t", \
                    locate.select(".pjadt_quality")[0].get_text().rjust(15), "\t", \
                    locate.select(".pjadt_wuranwu")[0].get_text().rjust(15), "\t", \
                    locate.select(".pjadt_pm25")[0].get_text().rjust(15), "\t", \
                    locate.select(".pjadt_pm10")[0].get_text().rjust(15)
            print>> f, "\n\n\n"
        return 0
    except Exception, e:
        logging.error(e)
        return 1
Code example #11
 def getHuoYuanPage(self, page):
     data = {
         "__VIEWSTATE": "/wEPDwULLTE5ODUzMDUxMDQPZBYEAgMPZBYCZg9kFgICAQ9kFgxmDxYCHgdWaXNpYmxlZ2QCAQ8PFgQeC05hdmlnYXRlVXJsBa0BaHR0cDovL3d3dy5reXdtYWxsLmNvbS9tZW1iZXIvbWVtYmVyL2xvZ2luLmFzcHg/Z291cmw9aHR0cCUzYSUyZiUyZndsLmt5d21hbGwuY29tJTJmd2xfc2VhcmNoJTJmd2xfc2VhcmNoX3Byb2R1Y3RfbGlzdC5hc3B4JTNmdHlwZSUzZDElMjZwcm92aW5jZSUzZC0xJTI2Y2l0eSUzZC0xJTI2a3dvcmQlM2QfAGcWAh4DcmVsBQhub2ZvbGxvd2QCAg8PFgIfAQVraHR0cDovL3d3dy5reXdtYWxsLmNvbS9tZW1iZXIvbWVtYmVyL2xvZ2luLmFzcHg/Z291cmw9aHR0cDovL3d3dy5reXdtYWxsLmNvbS9tZW1iZXIvbWFuYWdlL21iX2J1c2luZXNzLmFzcHgWAh8CBQhub2ZvbGxvd2QCAw8PFgoeBFRleHQFEuafpeivoui/mOasvuiusOW9lR4IQ3NzQ2xhc3MFCm5hdkZvbnRSZWQfAQUiaHR0cDovL2ZxLmt5d21hbGwuY29tL2luZGV4ZnEuYXNweB4GVGFyZ2V0BQZfYmxhbmseBF8hU0ICAhYCHwIFCG5vZm9sbG93ZAIEDw8WBB8BBbABaHR0cDovL3d3dy5reXdtYWxsLmNvbS9tZW1iZXIvbWVtYmVyL3JlZ2lzdGVyLmFzcHg/Z291cmw9aHR0cCUzYSUyZiUyZndsLmt5d21hbGwuY29tJTJmd2xfc2VhcmNoJTJmd2xfc2VhcmNoX3Byb2R1Y3RfbGlzdC5hc3B4JTNmdHlwZSUzZDElMjZwcm92aW5jZSUzZC0xJTI2Y2l0eSUzZC0xJTI2a3dvcmQlM2QfAGcWAh8CBQhub2ZvbGxvd2QCBQ8PZBYCHwIFCG5vZm9sbG93ZAIFD2QWBAIJDxYCHgtfIUl0ZW1Db3VudAIKFhZmD2QWAmYPFQQGMzQ1NjY5bumZleilv+amhuael+WcsOWMuuelnuacqOWOvy0+5rmW5YyX5a6c5piM5a6c5piM5Y6/77yM5pyJ54Wk54KtMTAwMOWQqO+8jOaxguWkmui+hjEz57GzNDDlkKjovabvvIzku7fpq5jmgKXotbA7KDEzMDk4Mjk4MDk0ICAwOTEyLTg0NTM1MzUgIGxpYW5ncm9uZ2tlICAQMjAxNC0wNS0xOSAxMjoxNmQCAQ9kFgJmDxUEBjM0NTY2OFXmsZ/oi4/lrr/ov4EtPuaxn+iLj+a3ruWuieebseecmeWOv++8jOaciemHjei0pzHlkKjvvIzmsYIx6L6GMTXnsbPovabvvIzku7fpq5jmgKXotbA7DTE1MTYxMjI2ODcwICAQMjAxNC0wNS0xOCAxMDo0MWQCAg9kFgJmDxUEBjM0NTY2NzjotKfmupDvvJrmgKXpnIDopoHmsYLmnInlj5HliLDlm57ljrvovabotKfvvIzlpKfovaYxMuexsw0xNTE1MTg1MTcxMCAgEDIwMTQtMDUtMTMgMTk6MDZkAgMPZBYCZg8VBAYzNDU2NjTXAua5luWNl+ihoemYs+iAkumYs+W4gi0+5rKz5YyX6YKv6YO477yM5pyJ6YeN6LSnMjAw77yM5rGCMTPnsbPovabvvIzku7fpq5jmgKXotbA75rmW5Y2X6KGh6Ziz6ICS6Ziz5biCLT7msrPljJfpgq/pg7jvvIzmnInph43otKcyMDDvvIzmsYIxM+exs+i9pu+8jOS7t+mrmOaApei1sDvmuZbljZfooaHpmLPogJLpmLPluIItPuays+WMl+mCr+mDuO+8jOaciemHjei0pzIwMO+8jOaxgjEz57Gz6L2m77yM5Lu36auY5oCl6LWwO+a5luWNl+ihoemYs+iAkumYs+W4gi0+5rKz5YyX6YKv6YO477yM5pyJ6YeN6LSnMjAw77yM5rGCMTPnsbPovabvvIzku7fpq5jmgKXotbA75rmW5Y2X6KGh6Ziz6ICS6Ziz5biCLT4NMTM5NzQ3MTAyODYgIBAyMDE0LTA1LTA2IDExOjMzZAIED2QWAmYPFQQGMzQ1NjYzqgHmuZbljZfooaHpmLPogJLpmLPluIItPuWxseilv+WkquWOn++8jOacieW7uuadkDM0LTExMO+8jOaxgjEtM+i+hjEz57Gz6L2m77yMODYwMOWFgy875rmW5Y2X6KGh6Ziz6ICS6Ziz5biCLT7lsbHopb/lpKrljp/vvIzmnInlu7rmnZAzNC0xMTDvvIzmsYIxLTPovoYxM+exs+i9pu+8jDg2MDDlhYMvOw0xMzk3NDcxMDI4NiAgEDIwMTQtMDUtMDYgMTE6MzJkAgUPZBYCZg8VBAYzNDU2NjLIAua5luWNl+ihoemYs+iAkumYs+W4gi0+55SY6IKD5q2m5aiB5Zyw5Yy677yM5pyJ5bu65p2QMzXvvIzmsYIxM+exs+i9pu+8jDE3MDAw5YWDLzvmuZbljZfooaHpmLPogJLpmLPluIItPueUmOiCg+atpuWogeWcsOWMuu+8jOacieW7uuadkDM177yM5rGCMTPnsbPovabvvIwxNzAwMOWFgy875rmW5Y2X6KGh6Ziz6ICS6Ziz5biCLT7nlJjogoPmrablqIHlnLDljLrvvIzmnInlu7rmnZAzNe+8jOaxgjEz57Gz6L2m77yMMTcwMDDlhYMvO+a5luWNl+ihoemYs+iAkumYs+W4gi0+55SY6IKD5q2m5aiB5Zyw5Yy677yM5pyJ5bu65p2QMzXvvIzmsYIxM+exs+i9pu+8jDE3MDAw5YWDLzs6MTM5NzQ3MTAyODYgIDA3MzQtNDIyNDIxMSAgIDEzOTc1NDE3MzY4ICDogIHpm7fotKfov5Dnq5kgIBAyMDE0LTA1LTA2IDExOjI4ZAIGD2QWAmYPFQQGMzQ1NjYx4QHmuZbljZfooaHpmLPogJLpmLPluIItPuWxseilv+i/kOWfju+8jOaciemHjei0pzM177yM5rGCMTPnsbPovabvvIw3ODAw5YWDLzvmuZbljZfooaHpmLPogJLpmLPluIItPuWxseilv+i/kOWfju+8jOaciemHjei0pzM177yM5rGCMTPnsbPovabvvIw3ODAw5YWDLzvmuZbljZfooaHpmLPogJLpmLPluIItPuWxseilv+i/kOWfju+8jOaciemHjei0pzM177yM5rGCMTPnsbPovabvvIw3ODAw5YWDLzs6MTM5NzQ3MTAyODYgIOiAgembt+i0p+i/kOermSAgMDczNC00MjI0MjExICAgMTM5NzU0MTczNjggIBAyMDE0LTA1LTA2IDExOjI4ZAIHD2QWAmYPFQQGMzQ1NjYw1wLmuZbljZfooaHpmLPogJLpmLPluIItPuays+WMl+mCr+mDuO+8jOaciemHjei0pzIwMO+8jOaxgj
Ez57Gz6L2m77yM5Lu36auY5oCl6LWwO+a5luWNl+ihoemYs+iAkumYs+W4gi0+5rKz5YyX6YKv6YO477yM5pyJ6YeN6LSnMjAw77yM5rGCMTPnsbPovabvvIzku7fpq5jmgKXotbA75rmW5Y2X6KGh6Ziz6ICS6Ziz5biCLT7msrPljJfpgq/pg7jvvIzmnInph43otKcyMDDvvIzmsYIxM+exs+i9pu+8jOS7t+mrmOaApei1sDvmuZbljZfooaHpmLPogJLpmLPluIItPuays+WMl+mCr+mDuO+8jOaciemHjei0pzIwMO+8jOaxgjEz57Gz6L2m77yM5Lu36auY5oCl6LWwO+a5luWNl+ihoemYs+iAkumYs+W4gi0+DTEzOTc0NzEwMjg2ICAQMjAxNC0wNS0wNiAxMDozM2QCCA9kFgJmDxUEBjM0NTY1OaoB5rmW5Y2X6KGh6Ziz6ICS6Ziz5biCLT7lsbHopb/lpKrljp/vvIzmnInlu7rmnZAzNC0xMTDvvIzmsYIxLTPovoYxM+exs+i9pu+8jDg2MDDlhYMvO+a5luWNl+ihoemYs+iAkumYs+W4gi0+5bGx6KW/5aSq5Y6f77yM5pyJ5bu65p2QMzQtMTEw77yM5rGCMS0z6L6GMTPnsbPovabvvIw4NjAw5YWDLzsNMTM5NzQ3MTAyODYgIBAyMDE0LTA1LTA2IDEwOjMyZAIJD2QWAmYPFQQGMzQ1NjU4yALmuZbljZfooaHpmLPogJLpmLPluIItPueUmOiCg+atpuWogeWcsOWMuu+8jOacieW7uuadkDM177yM5rGCMTPnsbPovabvvIwxNzAwMOWFgy875rmW5Y2X6KGh6Ziz6ICS6Ziz5biCLT7nlJjogoPmrablqIHlnLDljLrvvIzmnInlu7rmnZAzNe+8jOaxgjEz57Gz6L2m77yMMTcwMDDlhYMvO+a5luWNl+ihoemYs+iAkumYs+W4gi0+55SY6IKD5q2m5aiB5Zyw5Yy677yM5pyJ5bu65p2QMzXvvIzmsYIxM+exs+i9pu+8jDE3MDAw5YWDLzvmuZbljZfooaHpmLPogJLpmLPluIItPueUmOiCg+atpuWogeWcsOWMuu+8jOacieW7uuadkDM177yM5rGCMTPnsbPovabvvIwxNzAwMOWFgy87OjEzOTc0NzEwMjg2ICAwNzM0LTQyMjQyMTEgICAxMzk3NTQxNzM2OCAg6ICB6Zu36LSn6L+Q56uZICAQMjAxNC0wNS0wNiAxMDoyOGQCCg9kFgICAQ8WAh8AaBYCZg9kFgICAQ8PFgIfAGhkZAILD2QWAmYPDxYCHgtSZWNvcmRjb3VudALYjRFkZGS0MkBs5Z/XXB/pf4OF7cvKZ50NBi3Fx9/BpGgMqe3mUQ==",
         "__EVENTTARGET": "webPager$WebPager",
         "__EVENTARGUMENT": page + 1,
         "__EVENTVALIDATION": "/wEWCgKvz7e5CAK/rvSaAwKewauqDALTxLDBDwKJl+zrAwKIqsLKAgLTupi8CgLxq8OpDQK+sdPcCAKM54rGBuEJlK4dUlfESR5ctz/iPRP2/0Ifmh19XKMdEd06zeOc",
         "SProvinceCityArea1$hidP": -2,
         "SProvinceCityArea1$hidC": -2,
         "SProvinceCityArea1$hidA": -2,
         "SProvinceCityArea1$hidPName": None,
         "SProvinceCityArea1$hidCName": None,
         "SProvinceCityArea1$hidAName": None,
         "txtKeyword": "多个目的地用“,”隔开",
         "webPager$WebPager_input": page,
     }
     url = "http://wl.kywmall.com/wl_search/wl_search_product_list.aspx?type=1&province=-1&city=-1&kword="
     text = self.httpClient.geturlcon(url, data)
     soup = BeautifulSoup(text)
     soup.prettify()
     hrefs = soup.findAll(name="a", href=re.compile("details"))
     id_list = []
     for href in hrefs:
         str_url = str(href).split('"')
         id_list.append(str_url[1].split("=")[1])
     return id_list
Code example #12
File: __init__.py  Project: Frihet/djangomail
def scrub_html_email(text, cid_mapping={}):

    from BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup(text)

    for tag in soup.findAll(True):
        attrs = dict(tag.attrs)
        if 'src' in attrs:
            src = attrs['src']
            if src[:4]=='cid:':
                tag['src'] = cid_mapping[src[4:]]

    mapped = soup.renderContents()

    scrubber = tuit.scrubber.Scrubber(autolink=False)

    # The scrubber removes complete html documents out of the box? Weird...
    scrubber.disallowed_tags_save_content.add('html')
    scrubber.disallowed_tags_save_content.add('body')
    scrubber.disallowed_tags_save_content.add('xml')
    scrubber.disallowed_tags_save_content.add('doctype')
    scrubber.allowed_attributes.add('color')
    scrubbed = scrubber.scrub(mapped)
    
    return scrubbed
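
The cid: rewriting step above can be exercised on its own (the scrubbing half depends on the project's tuit.scrubber module and is left out here); the HTML fragment and mapping are invented:

from BeautifulSoup import BeautifulSoup

cid_mapping = {"logo123": "/attachments/42/logo.png"}   # invented content-id mapping
soup = BeautifulSoup('<p>Hi<img src="cid:logo123"></p>')
for tag in soup.findAll(True):
    attrs = dict(tag.attrs)
    if 'src' in attrs and attrs['src'][:4] == 'cid:':
        tag['src'] = cid_mapping[attrs['src'][4:]]
print soup.renderContents()   # <p>Hi<img src="/attachments/42/logo.png" /></p>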
Code example #13
File: crawler.py  Project: egornevezhin/crawler
	def parseWebPageContent(self, html):
		'''
		Parse the page content: extract and analyse links, extend the crawl queue.
		string html  the HTML source of the page
		'''
		#print envEncode(html)
		soup = BeautifulSoup(html)
		for a in soup.findAll('a'):
			if self.checkIfLinkShouldBeFollowed(a):
				url = a['href']
				#	split the link into its components
				urlsplitResult = urlparse.urlsplit(url)
				#	local links should not be included
				
				#	is a domain name given?
				if urlsplitResult.scheme == '':
					scheme = 'http'
				else:
					scheme = urlsplitResult.scheme
				#	reassemble the link
				url = urlparse.urlunsplit((scheme, 
						self.domain,
						urlsplitResult.path,
						urlsplitResult.query,
						'',))
				if url not in self.linksToFollow:
					self.linksToFollow.append(url)
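
The scheme/domain normalisation above can be checked in isolation; a sketch with an invented relative link and domain:

import urlparse

domain = "example.com"                        # stands in for self.domain
parts = urlparse.urlsplit("/news/item?id=7")  # a scheme-less link taken from a page
scheme = parts.scheme or 'http'
print urlparse.urlunsplit((scheme, domain, parts.path, parts.query, ''))
# -> http://example.com/news/item?id=7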
Code example #14
File: http.py  Project: Crypt0s/Ramen
    def __parse(self,result = None):
        type = result.getheader('content-type')

        if type == None or 'text' not in type:
            # return the result object.
            return (result.url, [], [])
        # Check to see if this was called from redirect
        soup = BeautifulSoup(result.read())
        urls = []
        # These two attributes are going to be the most reliable
        # TODO: I should let the tag attributes we want to pull urls out of be user-defined in the fs settings.
        for tag in soup.findAll(href=True):
            urls.append(tag['href'])
        for tag in soup.findAll(src=True):
            urls.append(tag['src'])
        # OK, now see if the URLs match the target.
        # TODO: do we want to check if the url is on the same IP or not?
        valid_urls = []
        for url in urls:
            v_url = self.__validateURL(url)
            if v_url is not None:
                # See if it's not already marked for scanning
                if v_url not in self.scanned and v_url not in self.to_scan:
                    valid_urls.append(v_url)
                    self.to_scan.append(v_url)

        print len(self.to_scan)
        return (result.url,[],valid_urls)
Code example #15
def getList(base_url, list_kind):
    c = urllib2.urlopen(base_url + list_kind + "?sort=time&start=0&filter=all&mode=grid&tags_sort=count")
    soup = BeautifulSoup(c.read())
    c.close()
    totalNumber = soup.find("span", {"class": "subject-num"}).contents[0]
    separator = "/"
    #    print totalNumber
    totalNumber = totalNumber[totalNumber.find(separator) + 7 :]
    totalNumber = string.atoi(totalNumber)
    #    print totalNumber

    bookList = soup.findAll("a", {"class": "nbg"})
    #    print bookList

    beginNumber = 15
    while totalNumber - 15 > 0:
        c = urllib2.urlopen(
            base_url + list_kind + "?sort=time&start=" + str(beginNumber) + "&filter=all&mode=grid&tags_sort=count"
        )
        soup = BeautifulSoup(c.read())
        c.close()
        bookList = bookList + soup.findAll("a", {"class": "nbg"})
        totalNumber = totalNumber - 15
        beginNumber += 15
    return bookList
Code example #16
def GetTable():
	wiki   = "http://en.wikipedia.org/wiki/List_of_districts_of_Germany"
	header = {'User-Agent': 'Mozilla/5.0'} #Needed to prevent 403 error on Wikipedia
	req    = urllib2.Request(wiki,headers=header)
	page   = urllib2.urlopen(req)
	soup   = BeautifulSoup(page)
 
	table = soup.find("table", { "class" : "wikitable sortable" })
	print table


	#columns of the wikitable 
	# District
	# Type 	
	# Land 	
	# Capital 
	T = [];
	for row in table.findAll("tr"):
		cells = row.findAll("td")
		#For each "tr", assign each "td" to a variable.
		if len(cells) == 4:
			T.append([ cells[0].find(text=True), cells[1].find(text=True), cells[2].find(text=True), cells[3].find(text=True) ]);

	#now we have list but we would like to have the coordinates.
	# We will ask gmap for the coordinates...
Code example #17
File: PackageSplitter.py  Project: cmsdoxy/tools
 def CreateSubPage(self, packageName):
     if not self.data:
         self.PrepareData()
     tab = self.GenerateTab(current = packageName)
     counter = 0
     htmlList = '<table class="directory">\n<tbody>\n'
     keysI = self.data[packageName].keys()
     keysI.sort()
     for i in keysI:
         if counter % 2 == 0:
             htmlList += '<tr id="row_%d_" class="even">\n' % counter
         else:
             htmlList += '<tr id="row_%d_">\n' % counter
         htmlList += '<td class="entry">\n<img src="ftv2node.png" alt="o" width="16" height="22">\n'
         htmlList += '<a class="el" href="%s" target="_self">%s</a>\n' % (self.data[packageName][i], i)
         htmlList += '</td>\n<td class="desc">\n</td>\n</tr>\n'
         
         counter += 1
     htmlList += '</tbody>\n</table>\n'
     
     temp = copy.deepcopy(self.packageSource)
     soup = BeautifulSoup(temp)
     list_  = soup.find('div', { "class" : "directory" })
     list_.replaceWith(htmlList)
     
     tab_  = soup.find('ul', { "class" : "tablist" })
     tab_.replaceWith(tab_.prettify() + tab)
     
     data = str(soup.prettify())
     
     self.WriteFile(self.__GetFileName(packageName), data.replace('&lt;','<').replace('&gt;', '>'))
Code example #18
File: wutongParser.py  Project: synckey/logistics
    def pharseHYMeta(self, text, id):
        kv_dic = WTHuoYuan.get_k_v_dic()
        result = {}
        result["webSiteId"] = id

        soup = BeautifulSoup(text)
        soup.prettify()

        table = soup.findAll(attrs={"class": "mt10"})
        tds = table[0].findAll("td")
        tds.extend(table[1].findAll("td"))
        key = None
        value = None
        allInfo = {}
        for td in tds:
            text = "".join(td.fetchText(True)).strip()
            if text.endswith(":"):
                key = text.replace(":", "")
            else:
                value = text
                allInfo[key] = value
                key = None
                value = None
        for k, v in allInfo.iteritems():
            if k:
                result[kv_dic.get(k)] = v
        return result
Code example #19
File: check_mesos.py  Project: amplab/ampcamp
def check_mesos_html(mesos_html):
  ## Find number of cpus from status page
  html_soup = BeautifulSoup(mesos_html)
  cpus_str = html_soup.findAll('td')[2].contents[0]
  mesos_num_cpus = int(cpus_str.strip("CPUs"))

  print "Mesos master reports " + str(mesos_num_cpus) + " CPUs"
Code example #20
 def testCData(self):
     xml = "<root>foo<![CDATA[foobar]]>bar</root>"
     self.assertSoupEquals(xml, xml)
     r = re.compile("foo.*bar")
     soup = BeautifulSoup(xml)
     self.assertEquals(soup.find(text=r).string, "foobar")
     self.assertEquals(soup.find(text=r).__class__, CData)
Code example #21
 def testComments(self):
     xml = "foo<!--foobar-->baz"
     self.assertSoupEquals(xml)
     r = re.compile("foo.*bar")
     soup = BeautifulSoup(xml)
     self.assertEquals(soup.find(text=r).string, "foobar")
     self.assertEquals(soup.find(text="foobar").__class__, Comment)
Code example #22
 def testSiblings(self):
     soup = BeautifulSoup("<ul><li>1<p>A</p>B<li>2<li>3</ul>")
     secondLI = soup.find('li').nextSibling
     self.assert_(secondLI.name == 'li' and secondLI.string == '2')
     self.assertEquals(soup.find(text='1').nextSibling.name, 'p')
     self.assertEquals(soup.find('p').nextSibling, 'B')
     self.assertEquals(soup.find('p').nextSibling.previousSibling.nextSibling, 'B')
Code example #23
def linkSpider(url, linkNum, repeatVal):
    # Loop counter
    searchRepeat = 0
    # Temp holding for links.
    links = list()
    # Turns link into searchable soup.
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html)
    # Loops as long as the increment variable is less than the requested repeat
    # value.
    while searchRepeat < int(repeatVal):
        # Allows you to update the soup from new links.
        soup = BeautifulSoup(html)
        # Loops through current soup looking for links and appends the list
        # temporarily.
        for link in soup.findAll("a", href=True):
            print (link.get("href", None))
            links.append(link.get("href", None))
            # Helps you see a break between searches.
        print "Break"
        # Changes the url to the new url. Offset used to compensate for
        # difference between computer starting at 0 and humans starting at 1
        # when counting objects.
        newUrl = links[int(linkNum) - 1]
        html = urllib.urlopen(newUrl).read()
        # Clears out the list so the new soup can be placed inside and indexed.
        links[:] = []

        searchRepeat += 1

    print "Final URL is: ", newUrl
Code example #24
 def testSiblings(self):
     soup = BeautifulSoup("<ul><li>1<p>A</p>B<li>2<li>3</ul>")
     secondLI = soup.find("li").nextSibling
     self.assert_(secondLI.name == "li" and secondLI.string == "2")
     self.assertEquals(soup.find(text="1").nextSibling.name, "p")
     self.assertEquals(soup.find("p").nextSibling, "B")
     self.assertEquals(soup.find("p").nextSibling.previousSibling.nextSibling, "B")
Code example #25
    def testQuotedAttributeValues(self):
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = BeautifulSoup(text)
        self.assertEquals(soup.renderContents(), text)

        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        newText = """<foo attr='Brawls happen at "Bob&squot;s Bar"'>a</foo>"""
        self.assertSoupEquals(soup.renderContents(), newText)

        self.assertSoupEquals('<this is="really messed up & stuff">',
                              '<this is="really messed up &amp; stuff"></this>')

        # This is not what the original author had in mind, but it's
        # a legitimate interpretation of what they wrote.
        self.assertSoupEquals("""<a href="foo</a>, </a><a href="bar">baz</a>""",
        '<a href="foo&lt;/a&gt;, &lt;/a&gt;&lt;a href="></a>, <a href="bar">baz</a>')

        # SGMLParser generates bogus parse events when attribute values
        # contain embedded brackets, but at least Beautiful Soup fixes
        # it up a little.
        self.assertSoupEquals('<a b="<a>">', '<a b="&lt;a&gt;"></a><a>"></a>')
        self.assertSoupEquals('<a href="http://foo.com/<a> and blah and blah',
                              """<a href='"http://foo.com/'></a><a> and blah and blah</a>""")
Code example #26
File: zhihu.py  Project: sunyaoxiang/BigFace
 def ask(self):
     html =  self.session.get(self.zhi + '/people/Grapher/asks').text
     soup = BeautifulSoup(html)
     ask_gen = ((self.zhi + str(ask.get('href')) + ' ' + str(ask.get_text().encode('utf-8'))) \
                 for ask in soup.find_all('a', 'question_link'))
     for ask in ask_gen:
         print ask
Code example #27
File: views.py  Project: TristanDamron/PowerSearch
def getMetaData(sites, url):
    #Instantiate an array of related urls
    related = []
    
    #Open the url
    site = BeautifulSoup(urllib2.urlopen(url).read())

    #Get all of the meta data keywords
    meta = site.findAll("meta")

    #Go through all of the sites
    for s in sites.all():
        #Open that site and get its meta data keywords
        tmp = BeautifulSoup(urllib2.urlopen(s.url).read()).findAll("meta")

        #Go through all of the meta keywords in meta
        for m in meta:
            #Go through all of the meta keywords in tmp
            for t in tmp:
                #If m == t...
                if str(m) == str(t):
                    #It's a related site!
                    related.append(str(s))
    
    return related
Code example #28
File: main.py  Project: ekelleyv/NewTrueBlue
def main():
	htmlpage = open('TowerVoting.html')
	soup = BeautifulSoup(htmlpage)
	people = []
	
	table= soup.find("table")
	body = table.find("tbody")
	tds = body.findAll("td")

	for td in tds:
		person = {}
		person["image"] = td.img["src"][21:]
		person["name"] = td.div.contents[0]
		people.append(person)

	for person in people:
		input = raw_input("Is {} a man?".format(person["name"]))
		if (input == ""):
			person["male"] = False
		else:
			person["male"] = True

	output_file = open('members.json', 'w')
	
	output = json.dumps(people)

	output_file.write(output)
Code example #29
File: bips.py  Project: mattdeboard/pf_random
def post():
        headers = login()
        thread_ids = []
        searchurl = 'http://postfarm.net/search.php?do=getdaily'
        srchresp, srchcont = http.request(searchurl, 'GET', headers = headers)

        soup = BeautifulSoup(srchcont)
        td_list = soup.findAll('td', id=re.compile('td_threadtitle_\d+'))

        for td in td_list:
                id = td['id']
                match = re.match("td_threadtitle_(\d+)", id)
                thread_ids.append(match.group(1))

        randurl, rand_thread_id = random_url(thread_ids)

        resp, cont = http.request(randurl, 'GET', headers=headers)
        page = BeautifulSoup(cont)
        token_value = str(page.find('input', attrs={'name':'securitytoken'})['value'])
        messagebody = {'message':'who wants some free nikes? we make em cheap',
                       'fromquickreply':'1',
                       's':'',
                       'securitytoken':token_value,
                       'do':'postreply',
                       't':rand_thread_id,
                       'p':'who cares',
                       'parseurl':'1'}
        resp, cont = http.request(randurl, 'POST', headers=headers, body=urllib.urlencode(messagebody))
Code example #30
 def testUnicodePickle(self):
     import cPickle as pickle
     html = "<b>" + chr(0xc3) + "</b>"
     soup = BeautifulSoup(html)
     dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
     loaded = pickle.loads(dumped)
     self.assertEqual(loaded.decode(), soup.decode())
Code example #31
    def getResultsByDayMonthYear(self, day, month, year):
        search_date = datetime.date(year, month, day)

        post_data = urllib.urlencode(
            (
                ("REGFROMDATE.MAINBODY.WPACIS.1.",
                 search_date.strftime(date_format)),
                ("REGTODATE.MAINBODY.WPACIS.1.",
                 search_date.strftime(date_format)),
                ("SEARCHBUTTON.MAINBODY.WPACIS.1.", "Search"),
            ))

        response = urllib2.urlopen(self.search_url, post_data)
        contents = response.read()

        # Let's give scrapers the chance to tidy up any rubbish - I'm looking
        # at you Cannock Chase
        contents = self._fixHTML(contents)

        # Check for the no results warning
        if not contents.count("No Matching Applications Found"):
            soup = BeautifulSoup.BeautifulSoup(contents)

            # Get the links to later pages of results.
            later_pages = soup.findAll(
                "a", {
                    "href":
                    re.compile(
                        "WPHAPPSEARCHRES\.displayResultsURL.*StartIndex=\d*.*")
                })

            for a in ["initial_search"] + later_pages:
                if a != "initial_search":
                    url = a['href']

                    # Example url

                    #http://digitalmaidstone.co.uk/swiftlg/apas/run/WPHAPPSEARCHRES.displayResultsURL?ResultID=244037&StartIndex=11&SortOrder=APNID:asc&DispResultsAs=WPHAPPSEARCHRES&BackURL=<a href=wphappcriteria.display?paSearchKey=147170>Search Criteria</a>

                    # urllib2 doesn't like this url, to make it happy, we'll
                    # get rid of the BackURL parameter, which we don't need.

                    split_url = urlparse.urlsplit(url)
                    qs = split_url[3]

                    # This gets us a list of (key, value) pairs
                    qsl = cgi.parse_qsl(qs)

                    # Get rid of BackURL
                    qsl.pop(-1)

                    # I think this is safe, as there are no repeats of parameters
                    new_qs = urllib.urlencode(qsl)

                    url = urlparse.urlunsplit(split_url[:3] + (new_qs, ) +
                                              split_url[4:])

                    this_page_url = urlparse.urljoin(self.base_url, url)
                    response = urllib2.urlopen(this_page_url)
                    contents = response.read()
                    soup = BeautifulSoup.BeautifulSoup(contents)

                results_table = self._findResultsTable(
                    soup)  #.body.find("table", {"class": "apas_tbl"})

                trs = self._findTRs(results_table)

                for tr in trs:
                    self._current_application = PlanningApplication()

                    tds = tr.findAll("td")

                    # The first td

                    #<td class="apas_tblContent"><a href="WPHAPPDETAIL.DisplayUrl?theApnID=07/1884&amp;backURL=&lt;a href=wphappcriteria.display?paSearchKey=147125&gt;Search Criteria&lt;/a&gt; &gt; &lt;a href='wphappsearchres.displayResultsURL?ResultID=243950%26StartIndex=1%26SortOrder=APNID:asc%26DispResultsAs=WPHAPPSEARCHRES%26BackURL=&lt;a href=wphappcriteria.display?paSearchKey=147125&gt;Search Criteria&lt;/a&gt;'&gt;Search Results&lt;/a&gt;"></a><a href="wphappcriteria.display?paSearchKey=147125">Search Criteria</a> > <a href="wphappsearchres.displayResultsURL?ResultID=243950%26StartIndex=1%26SortOrder=APNID:asc%26DispResultsAs=WPHAPPSEARCHRES%26BackURL=&lt;a href=wphappcriteria.display?paSearchKey=147125&gt;Search Criteria&lt;/a&gt;"></a><a href="wphappcriteria.display?paSearchKey=147125">Search Criteria</a>'>Search Results">07/1884</td>

                    # The html here is a bit of a mess, and doesn't all get into
                    # the soup.
                    # We can get the reference from the first <a href> in td 0.
                    first_link = tds[0].a['href']

                    app_id = cgi.parse_qs(
                        urlparse.urlsplit(first_link)[3])['theApnID'][0]

                    self._current_application.date_received = search_date
                    self._current_application.council_reference = app_id
                    self._current_application.info_url = self.info_url % (
                        app_id)
                    self._current_application.comment_url = self.comment_url % (
                        app_id)
                    self._current_application.description = tds[
                        1].string.strip()

                    # the second td

                    #<td class="apas_tblContent"><input type="HIDDEN" name="ORDERCOUNTER.PAHEADER.PACIS2.1-1." value="1" class="input-box" size="7" />
                    #LAND ADJ. BRAMBLING, HAWKENBURY ROAD, HAWKENBURY, TN120EA
                    #</td>

                    address = ' '.join([
                        x for x in tds[2].contents
                        if isinstance(x, BeautifulSoup.NavigableString)
                    ]).strip()

                    self._current_application.address = address
                    self._current_application.postcode = getPostcodeFromText(
                        address)

                    self._results.addApplication(self._current_application)

        return self._results
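
The application id is recovered from the link's query string with urlparse and cgi.parse_qs; a sketch of that step on a simplified detail link:

import cgi, urlparse

first_link = "WPHAPPDETAIL.DisplayUrl?theApnID=07/1884&backURL=whatever"   # simplified example link
app_id = cgi.parse_qs(urlparse.urlsplit(first_link)[3])['theApnID'][0]
print app_id   # 07/1884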
Code example #32
File: RingzerCodingChal4.py  Project: nightowl97/ctf
import BeautifulSoup
import requests
import os

url = 'https://ringzer0team.com/challenges/32/'  # challenge url
creds = {
    'username': '******',  # Replace with your creds
    'password': '******'  # in this dictionary
}

with requests.session() as s:
    # Login to site:
    postreq = s.post('https://ringzer0team.com/login', data=creds)
    # Go to challenge page now that we're logged in:
    page = s.get(url).content
    soup = BeautifulSoup.BeautifulSoup(page)
    # We know there's just one div result:
    div = soup.findAll('div', {'class': 'message'})[0]
    # Extract the text as string and remove useless garbage
    message = str(div.text)[25:-23]
    ops = message.split(' ')
    result = int(ops[0]) + int(ops[2], 16) - int(ops[4], 2)
    # Construct the URL and open it in the browser (make sure you're logged in in your browser as well)
    url += str(result)
    os.startfile(url)
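
The challenge string mixes a decimal, a hexadecimal and a binary operand; a sketch of the evaluation step with an invented message:

message = "1234 + 2f8 - 10110"   # invented: decimal + hex - binary
ops = message.split(' ')
result = int(ops[0]) + int(ops[2], 16) - int(ops[4], 2)
print result   # 1234 + 760 - 22 = 1972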
Code example #33
import urllib
from BeautifulSoup import *

url = raw_input('Enter - ')
html = urllib.urlopen(url).read()

soup = BeautifulSoup(html)

# Retrieve all of the anchor tags
tags = soup('span')
count = 0
for tag in tags:
    count += int(tag.contents[0])

print count
"""
   # Look at the parts of a tag
   print 'TAG:',tag
   print 'URL:',tag.get('class="comments"', None)
   print 'Contents:',tag.contents[0]
   print 'Attrs:',tag.attrs
   """
Code example #34
File: google.py  Project: al-layth/denigma
def search(query, tld='com', lang='en', num=10, start=0, stop=None, pause=2.0):
    """
    Search the given query string using Google.

    @type  query: str
    @param query: Query string. Must NOT be url-encoded.

    @type  tld: str
    @param tld: Top level domain.

    @type  lang: str
    @param lang: Languaje.

    @type  num: int
    @param num: Number of results per page.

    @type  start: int
    @param start: First result to retrieve.

    @type  stop: int
    @param stop: Last result to retrieve.
        Use C{None} to keep searching forever.

    @type  pause: float
    @param pause: Lapse to wait between HTTP requests.
        A lapse too long will make the search slow, but a lapse too short may
        cause Google to block your IP. Your mileage may vary!

    @rtype:  generator
    @return: Generator (iterator) that yields found URLs. If the C{stop}
        parameter is C{None} the iterator will loop forever.
    """

    # Set of hashes for the results found.
    # This is used to avoid repeated results.
    hashes = set()

    # Prepare the search string.
    query = urllib.quote_plus(query)

    # Grab the cookie from the home page.
    get_page(url_home % vars())

    # Prepare the URL of the first request.
    if num == 10:
        url = url_search % vars()
    else:
        url = url_search_num % vars()

    # Loop until we reach the maximum result, if any (otherwise, loop forever).
    while not stop or start < stop:

        # Sleep between requests.
        time.sleep(pause)

        # Request the Google Search results page.
        html = get_page(url)

        # Parse the response and process every anchored URL.
        soup = BeautifulSoup.BeautifulSoup(html)
        anchors = soup.findAll('a')
        for a in anchors:

            # Get the URL from the anchor tag.
            try:
                link = a['href']
            except KeyError:
                continue

            # Filter invalid links and links pointing to Google itself.
            link = filter_result(link)
            if not link:
                continue

            # Discard repeated results.
            h = hash(link)
            if h in hashes:
                continue
            hashes.add(h)

            # Yield the result.
            yield link

        # Prepare the URL for the next request.
        start += num
        if num == 10:
            url = url_next_page % vars()
        else:
            url = url_next_page_num % vars()
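
Since search() is a generator, callers normally bound it with the stop parameter; a minimal usage sketch (it assumes the module-level URL templates and the get_page helper that the function relies on are in place):

for url in search('site:python.org BeautifulSoup', stop=20, pause=2.0):
    print url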
Code example #35
File: thebiz.py  Project: andikrasta/webploit
	def served(self):
		t = urllib2.urlopen(self.headers["Referer"])
		html = t.read()
		soup = BeautifulSoup.BeautifulSoup(html)
		body = soup.find(["body"])
		return body
Code example #36
File: get_price_improve.py  Project: alphabetz/python
# get the rice price from CPF

import urllib
from BeautifulSoup import *


class bcolors:
    OKGREEN = '\033[92m'
    BOLD = '\033[1m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'


url = 'http://www.cpffeed.com/price_detail.html?product=8'
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)
price = soup.findAll("td", {"class": "price_product"})
'''
# Calculate price changed
last_price = float(price[10].contents[0].encode('utf-8'))
cur_price = float(price[3].contents[0].encode('utf-8'))
change = cur_price - last_price
'''
print "Fetched price from CPF"
print "---------------------"

for i in xrange(1, len(price), 7):
    try:
        last_price = float(price[i + 9].contents[0].encode('utf-8'))
        cur_price = float(price[i + 2].contents[0].encode('utf-8'))
        change = cur_price - last_price
    except (IndexError, ValueError):
        # the price table may end before i + 9; stop once we run past it
        break
Code example #37
__author__ = 'stevenkaplan'
from sefaria.model import *
from BeautifulSoup import *
from sources.functions import *
if __name__ == "__main__":
    contents = BeautifulSoup(open("alt_struct.xml")).contents[2].contents[3]
    nodes = []
    contents = filter(lambda x: type(x) is not NavigableString, contents)
    for count, each in enumerate(contents):
        en, he = each.attrs[0][1].split(" / ")
        node = ArrayMapNode()
        node.add_primary_titles(en, he)
        node.depth = 0
        if count == 0:
            node.wholeRef = "Mesillat Yesharim, Introduction"
        else:
            node.wholeRef = "Mesillat Yesharim {}".format(count)
        node.refs = []
        nodes.append(node.serialize())

    index = get_index("Mesillat Yesharim", server="http://www.sefaria.org")
    index['alt_structs'] = {"Subject": {"nodes": nodes}}
    post_index(index, server="http://www.sefaria.org")
Code example #38
from pprint import pprint
import BeautifulSoup
import requests
import re
import urllib

print "Kranthi"
#product url
url = 'http://www.amazon.com/dp/B0074R0Z3O'
response = requests.get(url, headers={'User-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36'})

soup = BeautifulSoup.BeautifulSoup(response.content)
print soup.find(id="productTitle").string

#preparing dic to store info
dic={}
dic['title'] = soup.find(id="productTitle").string

#array of reviews : MOST helpful reviews
reviewURLarr=[]
for div in soup.findAll(id=re.compile('^rev-dpReviewsMostHelpfulAUI-.*')):
	for reviewURL in div.findAll('a',{"class": "a-link-normal a-text-normal a-color-base" },href=True):
		reviewURLarr.append(str(reviewURL['href']))
print len(set(reviewURLarr))

#getting into review url and grabing the data
revdic={}
for reviewURL in set(reviewURLarr):
	# print reviewURL
	reviewResponse = requests.get(reviewURL, headers={'User-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36'})
	reviewSoup = BeautifulSoup.BeautifulSoup(reviewResponse.content)
Code example #39
import BeautifulSoup
import urllib
import re
u = 'http://www.xfront.com/us_states/'
soup = BeautifulSoup.BeautifulSoup(urllib.urlopen(u))
m = re.compile(
    r'<li>\n<p>Name: [\w ]*</p>\n<p>Capital Name: ([\w ]*)</p>\n<p>Capital Latitude: ([\d\.]*)</p>'
)
for a in sorted([m.match(str(x)).groups() for x in soup.findAll('li')],
                key=lambda x: x[1]):
    print a[0]
Code example #40
 t = t + 1
 if t%100 == 0:
     print str(float(t)/45)+'%'
 word = word[:-1].split('\t')
 xq = word[1]
 word = ''.join(word)
 #print xq,word
 params = {
             'word':word,
             'type':'0',
             'pn':'0',
             'rn':'10',
             'submit':'search'
           }
 result = html.get(url, params)
 soup = BeautifulSoup.BeautifulSoup(result, fromEncoding="gb18030")
 for ele in soup.findAll('h2'):
     try:
         xiaoquName = ""
         #tmp = unicode(ele.a.strong.contents[0])
         for e in ele.a.contents:
             if not isinstance(e, BeautifulSoup.NavigableString):
                 boldKey = unicode(e.contents[0])
                 #print boldKey
                 xiaoquName = xiaoquName + boldKey
             else:
                 key = unicode(e)
                 xiaoquName = xiaoquName + key
             #print xiaoquName
         if str(boldKey) == xq:
             dic[boldKey] = xiaoquName
Code example #41
import urllib
import BeautifulSoup

inputfile = open("input.txt", "r")
courses = []
for line in inputfile:
    line = line.rstrip()
    if line == "":
        continue
    courses.append(line)
inputfile.close()

outputfile = open("output.html", "w")
outputfile.write("<html><body>")
for course in courses:
    page = urllib.urlopen("http://www.mcgill.ca/study/2011-2012/courses/" +
                          course.replace(" ", "-"))
    soup = BeautifulSoup.BeautifulSoup(page.read())
    page.close()
    title = soup.findAll('h1')
    outputfile.write(str(title[1]).replace("h1", "h3"))
    result = soup.findAll('div', 'content')
    """Assuming it's the 3rd such div, but this might change!"""
    outputfile.write(result[2].prettify())
    outputfile.write("\n<br><hr>")
outputfile.write("</body></html>")

outputfile.close()
Code example #42
File: wrap_apis.py  Project: sterin/pywrapper
def download(base):
    f = open('/usr/share/doc/python/html/c-api/%s.html' % base, 'r')
    soup = BeautifulSoup.BeautifulSoup(f.read())

    return soup.findAll('dl', attrs={'class': 'function'})
Code example #43
kmlStartString = '<?xml version="1.0" encoding="UTF-8"?>\n<kml xmlns="http://www.opengis.net/kml/2.2"\nxmlns:gx="http://www.google.com/kml/ext/2.2">\n'  #here's the root node
#kmlStartString is the top of the kml file, it has the header info and styling for the placemarks in the tour.
kmlStartString += '<Document>\n<StyleMap id="weogeo_logo_map">\n<Pair>\n<key>normal</key>\n<styleUrl>#weogeo_logo</styleUrl>\n</Pair>\n<Pair>\n<key>highlight</key>\n<styleUrl>#weogeo_logo_h</styleUrl>\n</Pair>\n</StyleMap>\n<Style id="weogeo_logo">\n<IconStyle>\n<scale>1.1</scale>\n<Icon>\n<href>http://market.weogeo.com/ge/global/weo_button_small.png</href>\n</Icon>\n<hotSpot x="20" y="2" xunits="pixels" yunits="pixels"/>\n</IconStyle>\n</Style>\n<Style id="weogeo_logo_h">\n<IconStyle>\n<scale>1.3</scale>\n<Icon>\n<href>http://market.weogeo.com/ge/global/weo_button_small.png</href>\n</Icon>\n<hotSpot x="20" y="2" xunits="pixels" yunits="pixels"/>\n</IconStyle>\n</Style>\n<name>WeoGeo Kml Tour</name>\n<open>1</open>\n'
#kmlTourString is the string that holds the tour information ie: the playlist, etc.
kmlTourString = '<gx:Tour>\n<name>WeoGeo Tour</name>\n<gx:Playlist>\n'
kmlOverlayString = ''
kmlPlacemarkString = ''
#here's where we're going to have to add an entry for each map.
for fileName in os.listdir('tourKmls'):
    kmlFile = open(os.path.join('tourKmls', fileName),
                   'r')  #we open a file from tourKmls folder
    individualKmlString = kmlFile.read()  #read contents to string
    kmlFile.close()  #close the file
    #print "doing " + fileName # debug
    try:  #this block extracts the coordinates, the overlay tags, the names
        weoSoup = BeautifulSoup.BeautifulSoup(individualKmlString)
        documentContents = weoSoup.kml.document.contents
        for tag in documentContents:
            try:
                if tag.name == 'name':
                    nameString = tag.string
                    break
            except:
                pass
        overlayString = str(weoSoup.find('groundoverlay'))
        overlaySoup = BeautifulSoup.BeautifulSoup(overlayString)
        overlayContents = overlaySoup.contents
        for tag in overlayContents:
            if tag.name == 'name':
                tag.replaceWith('<name>' + nameString + '</name>')
        overlaySoup.groundoverlay['id'] = fileName
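The excerpt stops inside the per-file loop, before the collected strings are stitched back together; a plausible final assembly is sketched below, assuming the loop appends each overlay (and any matching placemark) to kmlOverlayString / kmlPlacemarkString and that the document is closed in header / tour / overlays / placemarks order (the output file name tour.kml is an assumption):

# a sketch of the final assembly, assuming the loop above filled
# kmlOverlayString and kmlPlacemarkString with one entry per input file
kmlTourString += '</gx:Playlist>\n</gx:Tour>\n'
kmlEndString = '</Document>\n</kml>\n'

outFile = open('tour.kml', 'w')  # hypothetical output name
outFile.write(kmlStartString + kmlTourString + kmlOverlayString +
              kmlPlacemarkString + kmlEndString)
outFile.close()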
コード例 #44
0
ファイル: bs1.py プロジェクト: dhruvcoder96/ScriptScrapper
#beautiful soup program

import urllib
from BeautifulSoup import *
site = raw_input("Enter the site: ")
handle = urllib.urlopen('http://' + site).read()
#parse the downloaded page with BeautifulSoup
handle1 = BeautifulSoup(handle)
lst = handle1('script')

for tags in lst:
    print tags.get('src', None)
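Relative src values are printed exactly as they appear in the page; a small follow-up sketch that resolves them against the site address is given below (the use of urlparse.urljoin is an addition, not part of the original script):

# resolve relative script paths against the page URL before printing
import urlparse

base_url = 'http://' + site
for tags in lst:
    src = tags.get('src', None)
    if src is not None:
        print(urlparse.urljoin(base_url, src))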
コード例 #45
0
import urllib
from BeautifulSoup import *

url = raw_input('Enter url: ')
count = int(raw_input('Enter number of times to repeat: '))
position = int(raw_input('Enter position number: '))

links = list()

while count >= 0 :
  html = urllib.urlopen(url).read()
  formattedHtml = BeautifulSoup(html)
  tags = formattedHtml('a')
  for tag in tags :
    links.append(str(tag.get('href', None)))
  print 'Retrieving: ' + url
  # find the url at position - 1 offset for index starting at 0
  url = links[position-1]
  # refresh the links list to the new list of the next url
  del links[:]
  count = count - 1
コード例 #46
0
ファイル: e9.py プロジェクト: stambik13/ergasies-patsak
import urllib
from bs4 import BeautifulSoup  # the bs4 API (find_all / get_text) is what this script uses

keywords = raw_input("enter the keywords: ")
keywords = [k.strip() for k in keywords.split(",")]

# count how many of the given keywords appear in each style description
mx1 = mx2 = mx3 = mx4 = 0

br1 = urllib.urlopen("http://www.brewerydb.com/style/88")
beer1 = BeautifulSoup(br1.read(), "html.parser")
for i in beer1.find_all(class_="description"):
    d1 = i.get_text()
    for j in keywords:
        if j in d1:
            mx1 = mx1 + 1

br2 = urllib.urlopen("http://www.brewerydb.com/style/62")
beer2 = BeautifulSoup(br2.read(), "html.parser")
for i in beer2.find_all(class_="description"):
    d2 = i.get_text()
    for j in keywords:
        if j in d2:
            mx2 = mx2 + 1

br3 = urllib.urlopen("http://www.brewerydb.com/style/119")
beer3 = BeautifulSoup(br3.read(), "html.parser")
for i in beer3.find_all(class_="description"):
    d3 = i.get_text()
    for j in keywords:
        if j in d3:
            mx3 = mx3 + 1

br4 = urllib.urlopen("http://www.brewerydb.com/style/15")
beer4 = BeautifulSoup(br4.read(), "html.parser")
for i in beer4.find_all(class_="description"):
    d4 = i.get_text()
    for j in keywords:
        if j in d4:
            mx4 = mx4 + 1
コード例 #47
0
ファイル: main.py プロジェクト: yoonsungwon/Study_py
import time
import BeautifulSoup
from selenium import webdriver
from HTMLParser import HTMLParser

url = 'https://onoffmix.com/event/'

driver = webdriver.Chrome('/webdriver/chromedriver')

driver.get(url)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
driver.execute_script("setMorePrint()")
time.sleep(10)
#driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#time.sleep(10)
req = driver.page_source

#headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'}
#req = requests.get(url, headers=headers)
soup = BeautifulSoup.BeautifulSoup(req)

h = HTMLParser()
soup2 = soup.find('div', attrs={'class': 'contentBox todayEventArea'})
with open('result.html', 'w') as f:
    f.write("""<!-- C\Code\mysite\elections\templates\elections\index.html -->
<!DOCTYPE html>
<html lang="en">
<head>
  <title>강의 목록</title>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <link rel="stylesheet" href="http://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/css/bootstrap.min.css">
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.0/jquery.min.js"></script>
  <script src="http://maxcdn.bootstrapcdn.com/bootstrap/3.3.6/js/bootstrap.min.js"></script>
</head>
コード例 #48
0
ファイル: TableParser.py プロジェクト: roksys/cmsdoxy
if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.stderr.write("not enough parameter!\n")
        sys.exit(1)

    # initialize variables
    htmlFullPath = sys.argv[1]
    htmlFilePath = os.path.split(htmlFullPath)[0]
    htmlFileName = os.path.split(htmlFullPath)[1]
    fileNameTemplate = htmlFileName.replace('.html', '_%s.html')

    # load the html page
    with open(htmlFullPath) as f:
        htmlPage = f.read()
        htmlPage = BeautifulSoup(htmlPage)

    # Have a look at pages.html: the class name of the tab bar that we attach
    # the generated 'index tab' to is different for that page. For
    # namespaces.html (namespace list) and annotated.html (~class list) the
    # class name is the same, 'tabs2'. That is why the destination tab class
    # name is set up differently depending on the html file name.
    if htmlFileName == 'packageDocumentation.html':
        pages = extractPagesForPackage()
        destTabClassName = 'tabs'
    elif htmlFileName == 'configfiles.html':
        pages = extractPages(configFileFlag=True)
        destTabClassName = 'tabs2'
    else:
        pages = extractPages()
コード例 #49
0
 def access_home(self):
   r = self.opener.open(self.scrape_url)
   page = r.read()
   soup = BeautifulSoup.BeautifulSoup(page)
   self.ICSID = soup.find('input', {'type': 'hidden', 'name': 'ICSID'})['value']
   print "Browsing key found: %s" % self.ICSID
コード例 #50
0
ファイル: ApexScrape.py プロジェクト: johnnymo87/Vending
 def getCoils(self, device):
     pkgQty = self.getPkgQty(self.devices[device][1])
     print 'Getting coils for', device
     parts = {}
     params = OrderedDict((('comId', self.devices[device][0]),
                           ('siteId', self.devices[device][1]),
                           ('requestId', self.devices[device][2])))
     try:
         request = self.s.post(
             "https://fastsolutions.mroadmin.com/Apex-Device/devicePOGAction_detailPOG.action",
             params=params)
     except:
         request = self.relog(
             "https://fastsolutions.mroadmin.com/Apex-Device/devicePOGAction_detailPOG.action",
             params)
     soup = BeautifulSoup.BeautifulSoup(request.content)
     self.branch = str(
         soup.find('div', {
             'id': 'head_company_name'
         }).contents[0][-5:])
     coils = int(soup.find('input', {'id': 'deviceBinCount'})['value'])
     lockers = int(soup.find('input', {'id': 'lockersCount'})['value'])
     for i in range(coils):
         row = soup.find('tr', {'id': 'tr' + str(i)})
         description = str(
             self.html_parser.unescape(row.contents[3].contents[0].strip()))
         if row.contents[5].contents[1].contents:
             SKU = re.search(
                 '[\d\w\[\]-]+',
                 str(row.contents[5].contents[1].contents[0].strip()))
             QTY = re.search('[\d.]+', str(row.contents[7].contents[0]))
             MAX = re.search('[\d.]+', str(row.contents[11].contents[0]))
             MIN = re.search('[\d.]+', str(row.contents[13].contents[0]))
             SKU, QTY, MAX, MIN = SKU.group(), int(float(QTY.group())), int(
                 float(MAX.group())), int(float(MIN.group()))
             QTY, MAX, MIN = map(lambda x: x * pkgQty[SKU], [QTY, MAX, MIN])
             if SKU in parts:
                 QTY, MAX, MIN = map(lambda x: x[1] + parts[SKU][x[0]],
                                     [('QTY', QTY), ('MAX', MAX),
                                      ('MIN', MIN)])
             parts[SKU] = {
                 'description': description,
                 'QTY': QTY,
                 'MAX': MAX,
                 'MIN': MIN
             }
         else:
             print 'Data missing from position ' + str(
                 int(float(row.contents[1].contents[0])))
     for i in range(lockers):
         row = soup.find('tr', {'id': 'lockerTr' + str(i)})
         description = str(
             self.html_parser.unescape(row.contents[3].contents[0].strip()))
         if row.contents[5].contents[1].contents:
             SKU = re.search(
                 '[\d\w\[\]-]+',
                 str(row.contents[5].contents[1].contents[0].strip()))
             QTY = re.search('[\d.]+', str(row.contents[7].contents[0]))
             MAX = re.search('[\d.]+', str(row.contents[9].contents[0]))
             MIN = re.search('[\d.]+', str(row.contents[11].contents[0]))
             SKU, QTY, MAX, MIN = SKU.group(), int(float(QTY.group())), int(
                 float(MAX.group())), int(float(MIN.group()))
             QTY, MAX, MIN = map(lambda x: x * pkgQty[SKU], [QTY, MAX, MIN])
             if SKU in parts:
                 QTY, MAX, MIN = map(lambda x: x[1] + parts[SKU][x[0]],
                                     [('QTY', QTY), ('MAX', MAX),
                                      ('MIN', MIN)])
             parts[SKU] = {
                 'description': description,
                 'QTY': QTY,
                 'MAX': MAX,
                 'MIN': MIN
             }
         else:
             print 'Data missing from position ' + str(
                 int(float(row.contents[1].contents[0])))
     return parts
コード例 #51
0
excel = "C:\Users\succful\Desktop\\nicetest.xlsx"
os.popen("md D:\\Download\>nul 2>nul")

browser = spynner.Browser()
browser.create_webview()
browser.set_html_parser(pyquery.PyQuery)

#browser.load("https://support.hp.com/us-en/drivers/selfservice/hp-elite-slice/12710078",load_timeout=120)
#browser.load("https://support.hp.com/cn-zh/drivers/selfservice/hp-elite-slice/12710078",load_timeout=120)
#open("Test.html", 'w+').write(browser.html.encode("utf-8"))
f = open("D:\Download\Test.html", 'w+')
f.write(browser.html.encode("utf-8"))
f.close()
#browser.close()

soups = BeautifulSoup.BeautifulSoup(open("D:\Download\Test.html"))
tag_button = soups.findAll(
    'button', {"class": "hidden-lg button button-sm primary hpdiaButton"})
num = len(tag_button)
'''
ntfile = open("ntname.txt","r")
ntNames = ntfile.readlines()
ntfile.close()
ntName=ntName.strip("\n")
'''
data = xlrd.open_workbook(excel)
table = data.sheet_by_name(u'sheet')
nrows = table.nrows
for i in range(1, nrows):
    ntName = table.cell(i, 0).value
    i = 0
コード例 #52
0
 def get_home(self):
   r = self.opener.open(self.scrape_url)
   page = r.read()
   return BeautifulSoup.BeautifulSoup(page)
コード例 #53
0
def LISTMOVIES(murl, name, index, page=1):
    turl = murl

    totalMoviesToLoad = settings.getNoOfMoviesToLoad()

    dialogWait = xbmcgui.DialogProgress()

    ret = dialogWait.create('Please wait until [Movies] are cached.')
    loadedLinks = 0
    totalLinks = totalMoviesToLoad
    remaining_display = 'Movies loaded :: [B]' + str(
        loadedLinks) + ' / ' + str(totalLinks) + '[/B].'
    dialogWait.update(0, '[B]Will load instantly from now on[/B]',
                      remaining_display)
    xbmc.executebuiltin("XBMC.Dialog.Close(busydialog,true)")

    quality = None
    hindiMovie = False
    year = None
    pagesScanned = 0
    while ((pagesScanned < 5) and (loadedLinks <= totalMoviesToLoad)):
        purl = turl
        if int(page) > 1:
            purl = turl + "?paged=" + str(page)
        link = main.OPENURL(purl)
        soup = BeautifulSoup.BeautifulSoup(link).findAll('item')
        for item in soup:
            quality = ''
            hindiMovie = False
            year = ''

            name = item.title.text
            url = item.comments.text.replace('#comments', '')
            for category in item.findAll('category'):
                if category.text == 'Hindi Movies':
                    #print item
                    hindiMovie = True
                elif re.search('DVD', category.text, flags=re.I):
                    quality = ' [COLOR red][DVD][/COLOR] '
                elif re.search('/*BluRay/*', category.text, flags=re.I):
                    quality = ' [COLOR red][HD][/COLOR] '
                elif re.search('[1-2][0,9][0-9][0-9]',
                               category.text,
                               flags=re.I):
                    year = category.text
                if dialogWait.iscanceled(): return False
            if dialogWait.iscanceled(): return False
            if hindiMovie:
                pagesScanned = 0
                main.addDirX(name + quality,
                             url,
                             constants.SOMINAL_LOADVIDEOS,
                             '',
                             searchMeta=True,
                             metaType='Movies',
                             year=year)
                loadedLinks = loadedLinks + 1
                percent = (loadedLinks * 100) / totalLinks
                remaining_display = 'Movies loaded :: [B]' + str(
                    loadedLinks) + ' / ' + str(totalLinks) + '[/B].'
                dialogWait.update(percent,
                                  '[B]Will load instantly from now on[/B]',
                                  remaining_display)
                if loadedLinks >= totalLinks:
                    print 'BREAKING'
                    break
                if dialogWait.iscanceled(): return False
        if dialogWait.iscanceled(): return False
        page = str(int(page) + 1)
        pagesScanned = pagesScanned + 1
    dialogWait.close()
    del dialogWait

    main.addDir('[COLOR blue]Next[/COLOR]',
                murl,
                constants.SOMINAL_LISTMOVIES,
                art + '/next.png',
                index=index,
                page=str(page))
    xbmcplugin.setContent(int(sys.argv[1]), 'Movies')
    main.setSeasonView()
コード例 #54
0
 def _confirm_long_listing(self, ICStateNum):
   # basically, press "yes" to confirm showing more than 100 results
   
   r = self.opener.open(self.scrape_url)
   page = r.read()
   return BeautifulSoup.BeautifulSoup(page)
コード例 #55
0
import re
import scraperwiki
import BeautifulSoup

RE_TM = re.compile('Theresa May')
RE_LF = re.compile('Lynne Featherstone')
TM = 'Rt Hon Theresa May MP'
LF = 'Lynne Featherstone MP'

baseurl = 'http://www.equalities.gov.uk'
baselink = '/ministers/speeches-1.aspx'

# get speeches

speechlinks = []

html = scraperwiki.scrape(baseurl + baselink)
soup = BeautifulSoup.BeautifulSoup(html)
table = soup.find('table', {'summary': 'Speeches'})
data = table.findAll('td')
for contents in data:
    a = contents.find('a')
    if a: speechlinks.append(a['href'])

for link in speechlinks:
    if link.split('.')[-1] == 'doc':
        continue
    record = {}
    link = link.replace(u'\u2019', u'%92')
    link = baseurl + link
    print link
    record['department'] = 'Government Equalities Office'
    record['permalink'] = link
コード例 #56
0
#-*- coding: utf-8 -*-

from urllib.request import urlopen  # Python 3: urlopen lives in urllib.request
from bs4 import BeautifulSoup

url = "https://www.rottentomatoes.com/"
html = urlopen(url)
source = html.read()  # read the source as raw bytes
html.close()  # close the handle once urlopen is done

soup = BeautifulSoup(
    source, "html5lib"
)  # pass the document to the BeautifulSoup constructor to build the parse tree; by convention the object is called soup
table = soup.find(id="Top-Box-Office")
movies = table.find_all(class_="middle_col")

for movie in movies:
    title = movie.get_text()
    print(title)
    link = movie.a.get('href')
    url = 'https://www.rottentomatoes.com' + link
    print(url)
コード例 #57
0
import urllib
import BeautifulSoup

html = '<div><span><a href=http://naver.com>naver.com</a></span></div>'
soup = BeautifulSoup.BeautifulSoup(html)
print soup.prettify()

data = urllib.urlopen(
    'http://comic.naver.com/webtoon/list.nhn?titleId=20853&weekday=fri')
soup = BeautifulSoup.BeautifulSoup(data)
cartoons = soup.findAll('td', attrs={'class': 'title'})

title = cartoons[0].find('a').text
link = cartoons[0].find('a')['href']

print soup.prettify()
print title, link
'''
print cartoons[1].find('a').text
print cartoons[0]
'''
コード例 #58
0
						hash_sha1 = hashlib.sha1(hash_cmd).hexdigest()
						hash_sha256 = hashlib.sha256(hash_cmd).hexdigest()

						self.SendMsg(canal, banner + '0,1[4 HASH 0]14 MD5 4=>15 {}'.format(str(hash_md5)))
						self.SendMsg(canal, banner + '0,1[4 HASH 0]14 SHA1 4=>15 {}'.format(str(hash_sha1)))
						self.SendMsg(canal, banner + '0,1[4 HASH 0]14 SHA256 4=>15 {}'.format(str(hash_sha256)))

					except:
						self.SendMsg(canal, banner + '0,1 Por favor, use:15 ' + self.prefix + 'hash <senha>')

				if command[0] == 'hashkill':
					try:
						hashkill_hash = command[1]
						self.SendMsg(canal, banner + '0,1[4 HASHKILL 0]14 Checando:15 {} '.format(str(hashkill_hash)))
						r = requests.get('http://hashtoolkit.com/reverse-hash?hash=' + str(hashkill_hash))
						soup = BeautifulSoup.BeautifulSoup(r.text)
						hash_type = soup.tbody.td.text
						hashkill_list = []

						for result in soup.findAll('td', {'class':'res-text'}):
							hashkill_resolved = result.span.text
							hashkill_list.append(str(hashkill_resolved))
						if len(hashkill_list) == 1:
							for hashes in hashkill_list:
								self.SendMsg(canal, banner + '0,1[4 HASHKILL 0]14 Encontrado 4=>9 {} 4=>14 ({}) '.format(str(hashes), str(hash_type)))
						else:
							self.SendMsg(canal, banner + '0,1[4 HASHKILL 0]4 Hash não encontrada. ')
					except:
						self.SendMsg(canal, banner + '0,1[4 HASHKILL 0]4 Hash não encontrada. ')

				if command[0] == 'hex':
コード例 #59
0
import BeautifulSoup


def get_historical_low_price_from_sina_history(html_info):
    soup = BeautifulSoup.BeautifulSoup(html_info)
    data = soup.findAll('tr')[1:]
    price_list = [float(i.findAll('td')[8].text) for i in data]
    return list(reversed(price_list))
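A minimal usage sketch for the parser above, assuming the caller has already obtained the HTML of a Sina daily-history page whose table keeps the day's low in the ninth column (the file name below is a placeholder, not taken from the original project):

# hypothetical usage: feed a saved history page into the parser
with open('sina_history.html') as f:  # placeholder file name
    html_info = f.read()

low_prices = get_historical_low_price_from_sina_history(html_info)
print(low_prices[:5])  # earliest rows first, since the function reverses the table order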
コード例 #60
0
__name__ = "vutsuak"

import urllib2
import BeautifulSoup as bs4

site = "http://www.cnn.com/2016/02/10/world/vancouver-island-human-foot/index.html"
hdr = {'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request(site, headers=hdr)
page = urllib2.urlopen(req)  # fetch once, with the User-Agent header attached
soup = bs4.BeautifulSoup(page)
content = ""
author = ""
date = ""

t = soup.title.string.split(" ")[0:6]
title = ""
for i in range(len(t)):
    title += t[i] + " "
l = []
for node in soup.findAll('p'):
    l.append(''.join(node.findAll(text=True)))

for i in range(7, len(l)):
    content += l[i] + "\n"
author = l[3]
date = l[5]
print title
print author
print date
print content