def parse_response(self):
    soup = BeautifulSoup(self.response)
    head = soup.find("head")

    self.max_points = int(
        _get_value_from_soup(head, "meta", "value", {"name": "max-points"}, 0))

    if _get_value_from_soup(head, "meta", "value", {"name": "status"}) == "accepted":
        self.is_accepted = True

    meta_title = _get_value_from_soup(head, "meta", "content", {"name": "DC.Title"})
    if meta_title:
        self.meta["title"] = meta_title
    else:
        title = soup.find("title")
        if title:
            self.meta["title"] = title.contents

    self.meta["description"] = _get_value_from_soup(
        head, "meta", "content", {"name": "DC.Description"}, "")

    points = _get_value_from_soup(head, "meta", "value", {"name": "points"})
    if points is not None:
        self.points = int(points)
        self.is_graded = True
        self.is_accepted = True

    exercise_div = soup.body.find("div", {"id": "exercise"})
    if exercise_div is not None:
        self.content = exercise_div.renderContents()
    else:
        self.content = soup.body.renderContents()
def render(self):
    # TODO: fix and enable caching
    # content = cache.get(self.content_url)
    content = None
    url = self.content_url

    # If the page is not cached, retrieve it
    if content is None:
        opener = urllib2.build_opener()
        content = opener.open(url, timeout=5).read()
        # Save the page in cache
        # cache.set(self.content_url, content)

    soup = BeautifulSoup(content)

    # TODO: Disabled. Add GET parameter support and enable.
    # Make links absolute, quoted from http://stackoverflow.com/a/4468467:
    # for tag in soup.findAll('a', href=True):
    #     tag['href'] = urlparse.urljoin(self.content_url, tag['href'])

    # If there's no element specified, use the BODY.
    # Otherwise find the element with given id.
    if self.element_id == "":
        html = soup.find("body").renderContents()
    else:
        html = str(soup.find(id=self.element_id))

    return html
def render(self):
    content = cache.get(self.content_url)

    # If the page is not cached, retrieve it
    if content is None:
        opener = urllib2.build_opener()
        content = opener.open(self.content_url, timeout=5).read()
        # Save the page in cache
        cache.set(self.content_url, content)

    soup = BeautifulSoup(content)

    # Make links absolute, quoted from http://stackoverflow.com/a/4468467:
    for tag in soup.findAll('a', href=True):
        tag['href'] = urlparse.urljoin(self.content_url, tag['href'])

    # If there's no element specified, use the BODY.
    # Otherwise find the element with given id.
    if self.element_id == "":
        html = soup.find("body").renderContents()
    else:
        html = str(soup.find(id=self.element_id))

    return html
def parse(property_id, ratecode='SPGCP'):
    valid_property = False
    hotel_props = {'id': property_id}

    property_url = "%s?propertyID=%s" % (starwood_url, property_id)
    logging.debug("Property URL: %s" % property_url)

    starwood_response = urlfetch.fetch(url=property_url, deadline=10)
    if starwood_response:
        try:
            soup = BeautifulSoup(starwood_response.content) \
                .find(attrs={'id': 'propertyHighlight'}) \
                .find(attrs={'class': 'propertyContainer'})
        except:
            soup = None

        if soup:
            try:
                hotel_props['name'] = unicode(soup.find("a", "propertyName").contents[0]).strip()
                hotel_props['category'] = int(str(soup.find("span", "spgCategory").contents[0]).split()[-1])
                valid_property = True
            except:
                pass

            if valid_property:
                hotel_props['address'] = StarwoodParser.parse_address(soup)
                #hotel_props['awards'] = StarwoodParser.parse_starwood(soup.find("div", "tabsContentContainer").findAll("div", "tabContent"))
                hotel_props['image_url'] = str("http://www.starwoodhotels.com%s" % (soup.find("img", "propertyThumbnail")['src']))

    return valid_property and hotel_props or None
def handler(sock, url):
    htmlsource = sock.read().decode('gb18030', 'replace').encode('utf-8')
    soup = BeautifulSoup(htmlsource)
    content = soup.find("td", {"class": "jiawenzhang-type"})
    if content is None:
        return "content not found"
    return unicode(content)
def get_organic_data(html_data):
    bs = BeautifulSoup(str(html_data))
    div_filter = bs.find('div', {'id': 'ires'})
    if div_filter:
        contents = div_filter.findAll('li', {'class': 'g'})
        return contents
    return None
def fetch_trains(place_from, place_to, date):
    key = 'trains_' + place_from + '_' + place_to + '_' + str(date)
    data = memcache.get(key)  #@UndefinedVariable
    if data is not None:
        return data

    params = {'fromName': place_from,
              'toName': place_to,
              'when': utils.date_serialize(date),
              'search_type': 'suburban'}
    url = 'http://m.rasp.yandex.ru/search?' + urllib.urlencode(params)
    response = urlfetch.fetch(url)
    html = response.content

    soup = BeautifulSoup(html)
    list_node = soup.find("ul", {"class": "b-holster b-search-result"})
    if list_node is not None:
        regex = re.compile(r'<.*?>')
        b_nodes = list_node.findAll("b")
        result = []
        for b_node in b_nodes:
            data = regex.split(b_node.renderContents())
            try:
                time = [datetime.datetime.strptime(x, '%H:%M').time() for x in data]
                result.append(TrainTiming(time[0], time[1]))
            except:
                pass
        memcache.add(key, result, 60 * 60)  #@UndefinedVariable
        return result
def handler(sock, url):
    htmlsource = sock.read()
    soup = BeautifulSoup(htmlsource)
    content = soup.find(name="td", id=re.compile(r"postmessage_\d+"))
    if content is None:
        return "failed to read content"
    return unicode(content)
def assert_no_error_message_in_response(self, response):
    """Check that response has no error messages."""
    soup = BeautifulSoup(response)
    el = soup.find("p", "alert-error")
    if el:
        self.fail("error message found in response unexpectedly: {}".format(el.contents))
    els = soup.findAll("label", "alert-error")
    if els:
        self.fail("error message found in response unexpectedly: {}".format(els))
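A minimal usage sketch for the helper above, assuming it is available on a Django-style TestCase (for example via a mixin); the test class, URL, and form data are hypothetical:

from django.test import TestCase

class SignupFormTests(TestCase):
    # Hypothetical test: assumes assert_no_error_message_in_response is mixed
    # into this class and that a '/signup/' view exists in the project.
    def test_valid_signup_shows_no_errors(self):
        response = self.client.post('/signup/', {'email': 'user@example.com'})
        self.assert_no_error_message_in_response(response.content)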
def parse_matchup(league, team_id, week=None):
    team_info = {}

    params = {'mid1': team_id}
    if week:
        params.update({'week': week})
    matchup_url = build_url(league_id=league.id, page='matchup',
                            params=params, access_code=league.access_code)
    soup = BeautifulSoup(urlfetch.fetch(matchup_url).content).find('div', attrs={'id': 'yspmain'})

    try:
        week = int(soup.find('div', attrs={'id': 'matchupweeksubnav'})
                   .find('li', 'current selected').find('a').contents[0])
    except:
        week = 0

    matchup_soup = soup.find('div', attrs={'id': 'matchup'})
    team_name = matchup_soup.find('h2').find('a')['title']

    lineup_soup = matchup_soup.find('table', attrs={'id': 'statTable1'}).find('tbody')
    lineup = []
    for lineup_row in lineup_soup.findAll('tr')[:-1]:
        position = str(lineup_row.find('td').contents[0].strip())
        player_cell = lineup_row.find('td', 'player')
        try:
            player = player_cell.find('div').find('a').contents[0].strip()
        except:
            player = None
        try:
            player_status = player_cell.find('div', 'detail').find('span', 'status').contents[0].strip()
        except:
            player_status = None
        opp = str(lineup_row.find('td', 'opp').contents[0].strip()).replace(' ', '')
        projected_points = float(lineup_row.find('td', 'stat wide').contents[0])

        lineup.append({'position': position, 'player': player, 'status': player_status,
                       'projected': projected_points, 'opp': opp != 'Bye' and opp or None})

    return {'name': team_name, 'lineup': lineup, 'week': week}
def parse_organic_contents(raw_content, organic_pos):
    data_dict = {}
    data_dict['position'] = organic_pos

    b = BeautifulSoup(raw_content)
    rtitle = b.find('a')
    headline = p.sub('', str(rtitle))
    data_dict['title'] = headline

    display_url = parse_display_url(str(raw_content))
    data_dict['display_url'] = display_url

    rhref = b.find('a', href=True)
    url = str(rhref['href'])
    data_dict['url'] = ul.unquote(url)

    rtext = b.findAll('div', {'class': 's'})
    text = p.sub('', str(rtext))
    data_dict['text'] = text.replace(']', '').replace('[', '')

    return data_dict
def parse_page(writer, catalogue, page=1):
    print 'Parsing page %s' % page
    url = urllib.urlopen(URL % (catalogue, page))
    soup = BeautifulSoup(url)
    table = soup.find('table', attrs={'class': 'snippets'})
    for tr in table.findAll('tr'):
        # get name of the page
        name = tr.td.h4.a.string
        # get URL of the page
        url = tr.td.h4.a['href'].encode('utf-8')
        # get stats info
        stats = '?'
        stats_element = tr.find('p', attrs={'class': 'Stats'})
        if stats_element:
            stats = stats_element.strong.nextSibling.string[1:-11].replace(' ', '')
            if stats == 'wtrakc':
                stats = '?'
        # get price
        price = tr.find('td', attrs={'class': 'Price'}).strong.string[0:-12]
        # calculate CPM
        cpm = '?'
        try:
            cpm = (float(price) * 30) / int(stats) * 1000
        except:
            cpm = '?'
        # write to the file
        row = [name, url, stats, price.replace('.', ','), str(cpm).replace('.', ',')]
        print row
        writer.writerow(row)

    # find last page of the catalogue
    anchors = soup.findAll('a', href=re.compile('/networks/[0-9]+/websites\?page=[0-9]+'))
    if not anchors:
        return
    pages = []
    for anchor in anchors:
        number = re.match('/networks/[0-9]+/websites\?page=([0-9]+)', anchor['href']).group(1)
        pages.append(int(number))
    pages.sort()
    last = pages[-1]

    # parse next page if exists
    if last > page:
        next = page + 1
        parse_page(writer, catalogue, next)
def load(self):
    league_soup = BeautifulSoup(urllib2.urlopen(league_url).read())
    if league_soup:
        self.name = League.name(league_soup)
        self.mb = MessageBoard(self)

        team_rows = league_soup.find('table', attrs={'id': 'standingstable'}).tbody.findAll('tr')
        teams = [Team(self, team_id) for team_id in xrange(1, 2)]  # xrange(1, len(team_rows) + 1)

        for team in teams:
            print "%s, %s, \"%s\" %s\n" % (team.name, team.record, team.smack, team.roster)
def get_shows(): """docstring for get_shows""" html = retrieve_url(BASE_URL) soup = BeautifulSoup(html, fromEncoding="utf-8") #print soup #print "Autómánia" showsHtml = soup.find(id="topnav04-ul").findAll("li") shows = [] for show in showsHtml: shows.append({"title" : show.a.string, "url" : show.a['href']}) return shows
def basic_league_info(league_id, access_code=None):
    league_url = build_url(league_id=league_id, access_code=access_code)
    league_soup = BeautifulSoup(urlfetch.fetch(url=league_url).content).find('div', attrs={'id': 'yspmain'})

    if league_soup.find('div', attrs={'class': 'errors'}):
        valid_league = False
    else:
        valid_league = True

    if valid_league:
        league_name = str(league_soup.find('h1').contents[0].strip())
        try:
            teams_count = len(league_soup.find('table', attrs={'id': 'standingstable'}).find('tbody').findAll('tr'))
        except:
            teams_count = 0
        return {'name': league_name, 'teams_count': teams_count, 'errors': False}
    else:
        return {'errors': True}
def getWeatherInfo(self, my_phone):
    for user in self.users:
        # Build the query URL
        url = self.url + self.province_map[user.province.encode('gbk')] + '/' + self.city_map[user.city.encode('gbk')] + '.html'
        #print url
        page = urllib2.urlopen(url).read().decode('GBK').encode('utf-8')
        soup = BeautifulSoup(page)
        #print page.decode('utf-8').encode('gbk')
        city_body = soup.find('div', {'class': 'w365border city_body'})
        weather_info = city_body.findAll('div', {'class': 'weather_div'})
        self.sendSMS(my_phone, weather_info[1], user)  # tomorrow's weather
        self.sendSMS(my_phone, weather_info[2], user)  # the day after tomorrow's weather
def __init__(self, league, team_id):
    team_url = "http://%s%s%s/%d?pak=%s" % (league.sport, YAHOO_FB, league.league_id, team_id, league.access_code)
    team_soup = BeautifulSoup(urllib2.urlopen(team_url).read()).find('div', attrs={'id': 'bd'})
    team_info_soup = team_soup.find('div', attrs={'id': 'teaminfo'})

    self.name = clean(team_info_soup.h1.em.contents[0])
    self.record = Team.parse_record(team_info_soup)
    try:
        self.smack = clean(team_info_soup.find('p', attrs={'id': 'smacktext'}).contents[0])
    except:
        self.smack = ''
    self.roster = Roster(league, team_id).players
def get(self):
    # self.response.headers['Content-Type'] = 'text/plain'

    br = create_mechanize()

    base_url = "https://secure.hilton.com%s"
    login_landing_url = base_url % "/en/hhonors/login/login.jhtml"

    br.open(login_landing_url)
    br.select_form(name="loginForm")
    br.form['Username'] = '******'
    br.form['password'] = '******'
    br.submit()

    soup = BeautifulSoup(br.response().read())
    session_id = soup.find('form', attrs={'name': 'logout'})['action'].split(';jsessionid=')[1]

    hotel_url = "http://doubletree.hilton.com/en/dt/hotels/index.jhtml;jsessionid=%s?ctyhocn=CHINPDT" % (session_id)
    self.response.out.write("%s\n\n" % hotel_url)

    br.open(hotel_url)
    #a = br.select_form(name="rewardSearch")
    br.form.set_all_readonly(False)

    br.form.find_control(name="flexCheckInDay", kind="list").value = ["3"]
    br.form.find_control(name="flexCheckInMonthYr", kind="list").value = ["December 2010"]
    #br.form.find_control(name="checkInDay", kind="list").value = ["3"]
    #br.form.find_control(name="checkInMonthYr", kind="list").value = ["December 2010"]
    #br.form.find_control(name="checkOutDay", kind="list").value = ["5"]
    #br.form.find_control(name="checkOutMonthYr", kind="list").value = ["December 2010"]
    br.form.find_control(name="los", kind="list").value = ["2"]
    br.form["isReward"] = "true"
    br.form["flexibleSearch"] = "true"
    br.form["source"] = "hotelResWidget"
    br.submit()

    self.response.out.write("%s\n\n" % br.geturl())

    br.select_form(name="loginForm")
    br.form['Username'] = '******'
    br.form['password'] = '******'
    br.submit()

    self.response.out.write("%s\n\n" % br.geturl())

    for form in br.forms():
        pass
        # self.response.out.write("%s\n\n\n\n\n" % form)

    self.response.out.write("\n\n\n\n\n==============\n\n\n\n\n")
def get_shows(): """docstring for get_shows""" html = retrieve_url(BASE_URL) # fix the f****d up encoding in original document html = re.sub("Medi.n WebAudit RTLcsoport rtlmost.hu", "", html) soup = BeautifulSoup(html) #print soup.prettify #print "Autómánia" showsHtml = soup.find(id="topnav04-ul").findAll("li") #remove the last item showsHtml.pop(len(showsHtml)-1) shows = [] for show in showsHtml: shows.append({"title" : show.a.string, "url" : show.a['href']}) return shows
def get(self):
    def valid_setcode(soup):
        try:
            top_msg_div = soup.find('div', attrs={'id': 'topMsgDiv'})
            if top_msg_div.find('span', attrs={'class': 'error'}) and bool(top_msg_div.find('p').contents[0].strip()):
                return False
            else:
                return True
        except:
            return True

    self.response.headers['Content-Type'] = 'text/plain'

    try:
        set_code = int(self.request.get('set_code', 0))
    except:
        set_code = None

    if StarwoodSetCode.get_by_key_name(StarwoodSetCode.calc_key_name(set_code)):
        self.response.out.write("SET code entity already created.")
        return

    try:
        hotel_id = int(self.request.get('hotel_id', 0))
    except:
        hotel_id = None

    name = None

    if set_code and hotel_id:
        check_in = date.today() + relativedelta(months=1)
        check_out = check_in + relativedelta(days=1)

        #url = "https://www.starwoodhotels.com/preferredguest/search/ratelist.html?corporateAccountNumber=%d&lengthOfStay=1&roomOccupancyTotal=001&requestedChainCode=SI&requestedAffiliationCode=SI&theBrand=SPG&submitActionID=search&arrivalDate=2010-09-15&departureDate=2010-09-16&propertyID=%d&ciDate=09/15/2010&coDate=09/19/2010&numberOfRooms=01&numberOfAdults=01&roomBedCode=&ratePlanName=&accountInputField=57464&foo=5232"
        url = "https://www.starwoodhotels.com/preferredguest/search/ratelist.html?arrivalDate=%s&departureDate=%s&corporateAccountNumber=%d&propertyID=%d" \
            % (helper.date_to_str(check_in), helper.date_to_str(check_out), set_code, hotel_id)

        try:
            response = urlfetch.fetch(url, deadline=10)
        except DownloadError, details:
            logging.error("DownloadError: %s" % details)
            response = None

        if response:
            soup = BeautifulSoup(response.content)
            if valid_setcode(soup):
                try:
                    name = str(soup.find('table', attrs={'id': 'rateListTable'}).find('tbody').find('tr').find('td', attrs={'class': 'rateDescription'}).find('p').contents[0].strip())
                except:
                    name = None
def parse_matchup_score2(league, team_id):
    generic_matchup_url = build_url(league_id=league.id, page='matchup', access_code=league.access_code)
    try:
        soup = BeautifulSoup(urlfetch.fetch(generic_matchup_url).content)
        matchup_path = soup.find('div', attrs={'class': 'scoreboard'}).find('td', attrs={'class': 'last'}).find('a')['href']
    except:
        matchup_path = None

    matchup_url = None
    if matchup_path:
        params = {}
        for p in matchup_path.split('?')[1].split('&'):
            k, v = p.split('=')
            params[k] = v
        matchup_url = build_url(league_id=league.id, page='matchup', params=params, access_code=league.access_code)

    return matchup_url
def getRes(self, url):
    page = urllib2.urlopen(url).read().\
        decode('GBK').encode('utf-8')
    ret_info = []
    soup = BeautifulSoup(page)
    # Get the product list from the final search results page
    deal_table = soup.find('table', {'class': 'deals'})
    # 'first' is the lowest-priced row; if it does not meet the user's
    # requirements, none of the rows below it will either, so we can return directly
    min_price = deal_table.findAll('tr', {'class': 'first'}, limit=1)[0]
    try:
        goods_name = str(min_price.h3.contents[0].string)
    except Exception, e:
        self.py_log.log("Failed to get the product name", self.py_log.get_file_name(), self.py_log.get_line())
        goods_name = "NULL"
def web_segments(self):
    segment_props = []

    response = fetch_url("http://washington.cbslocal.com/audio-on-demand/the-sports-junkies/")
    if response and response.status_code == 200:
        page_soup = BeautifulSoup(response.content)
        try:
            segment_blocks = page_soup.find('div', {'class': 'cols'}).find('ul').findAll('li')
        except:
            segment_blocks = None

        if segment_blocks:
            segment_blocks.reverse()
            for segment_block in segment_blocks:
                props = self.parse_web_segment(segment_block)
                self.response.out.write("%s\n" % props)
                segment_props.append(props)

    return segment_props
def handler(content, url):
    m = re.search(r".*?(\d+)\.htm", url)
    if m is None or len(m.groups()) < 1:
        return "Failed to parse url"
    com_url = "http://www.cnbeta.com/comment/normal/%s.html" % m.groups()[0]
    comment = ""
    try:
        sock = urllib2.urlopen(com_url)
        htmlsource = sock.read().decode('gb18030', 'replace').encode('utf-8')
        comment_soup = BeautifulSoup(htmlsource)
        comment = "".join("%s<br/>" % x.text for x in comment_soup.findAll("dd", {"class": "re_detail"}))
    except urllib2.HTTPError:
        comment = ""
    content_soup = BeautifulSoup(content, fromEncoding="gbk")
    content = content_soup.find("div", {"id": "news_content"})
    if content is None:
        return "content not found"
    result = u"%s<br/>Comment%s" % (content, comment)
    return unicode(result)
def parse(html, points_array, steps_array):
    soup = BeautifulSoup(html)
    routes = []
    route_index = 0
    while True:
        route_node = soup.find("div", {"id": "ts_route_" + str(route_index)})
        if route_node is None:
            break

        directions = []
        total_duration = 0
        steps = steps_array[route_index]
        points = points_array[route_index]

        for index in range(len(steps) - 1):
            step_node = route_node.find(attrs={"id": "step_" + str(route_index) + "_" + str(index) + "_segment"})
            step = common.RouteStep()

            if step_node is not None:
                step.direction = get_node_text(step_node.find(attrs={"class": "dir-ts-direction"}))

                segment_text = get_nodes_text(step_node.findAll(attrs={"class": "dirsegtext"}))
                if segment_text != '':
                    if step.direction.find('Unknown') > 0:
                        # Prevent 'Walk to Unknown' direction
                        step.direction = segment_text
                    else:
                        step.direction += ': ' + segment_text

                step.addinfo = get_nodes_text(step_node.findAll(attrs={"class": re.compile('^dir-ts-addinfo.*')})).replace('(', '').replace(')', '')
                step.duration = parse_duration(step.addinfo)
                step.initial_duration = step.duration
                total_duration += step.duration

                transport_type = get_transport(step.direction)
                if transport_type is None or transport_type == 'Walk':
                    step.direction = clean_walk_direction(step.direction)
                else:
                    line_number = get_node_text(step_node.find(attrs={"class": "trtline"}))
                    step.service_interval = parse_service_interval(step.addinfo)
                    step.transport = common.Transport(transport_type, line_number, step.service_interval)
                    step.direction = _(step.transport.type)
                    step.transport.stops = parse_stops(step.addinfo)
                    if step.transport.is_subway():
                        step.direction += utils.subway_color(' ' + _('line') + ' ' + str(step.transport.line_number), step.transport.line_number)
                    else:
                        step.direction += ' ' + _('number') + ' ' + step.transport.line_number
                    step.start_name = clean_walk_direction(get_node_text(step_node.find('b')))

                if step_node.nextSibling is not None:
                    arrive_node = step_node.nextSibling.find(text=re.compile('^Arrive.*'))
                    if arrive_node is not None:
                        step.end_name = clean_walk_direction(get_node_text(arrive_node.nextSibling))

            start_point = points[steps[index]['depPoint']]
            end_point = points[steps[index]['arrPoint']]
            step.start_location = common.GeoPoint(start_point['lat'], start_point['lng'])
            step.end_location = common.GeoPoint(end_point['lat'], end_point['lng'])

            if not step.is_walk():
                directions.append(step)

        routes.append(common.Route(directions, 'google'))
        route_index += 1

    return routes
import urllib2

from lib.BeautifulSoup import BeautifulSoup

agent = """Sosospider+(+http://help.soso.com/webspider.htm)"""

blog_url = 'http://blog.sina.com.cn/s/articlelist_1517582220_0_1.html'

spider_handle = urllib2.urlopen(blog_url)
blog_content = spider_handle.read()
soup = BeautifulSoup(blog_content, fromEncoding='utf-8')
item_list = soup.findAll('span', {'class': 'atc_title'})

urls = ['http://blog.csdn.net/heiyeshuwu/archive/2010/12/19/6085876.aspx']
#for item in item_list:
#    urls.append(item.a['href'])

for url in urls:
    request = urllib2.Request(url)
    request.add_header('User-Agent', agent)
    handle = urllib2.urlopen(request).read()
    article_soup = BeautifulSoup(handle, fromEncoding='utf-8')
    title = article_soup.find('h1', {'class': 'title_txt'})
    content = article_soup.find('div', {'id': 'sina_keyword_ad_area2'})
    # tmp = []
    # for c in content.contents:
    #     print type(c)
    #     tmp.append(c.__str__('utf-8'))
    print url
    print title.contents
    print title.contents[2].replace('\t', '').replace('\r\n', '')
    # print ''.join(tmp)
    exit()
def Items(self):
    """
    Generator yielding tuples:
    for HTML: (section, url, title, content)
    for images: (mime, url, filename, content)
    """
    cnt4debug = 0
    decoder = AutoDecoder()
    for section, url in self.feeds:
        cnt4debug += 1
        if IsRunInLocal and cnt4debug > 1:
            break

        opener = URLOpener(self.host)
        result = opener.open(url)
        status_code, content = result.status_code, result.content
        if status_code != 200 or not content:
            logging.error('err(%d) to fetch %s.' % (status_code, url))
            continue

        if self.page_encoding:
            content = content.decode(self.page_encoding)
        else:
            content = decoder.decode(content)

        content = self.preprocess(content)
        soup = BeautifulSoup(content)

        try:
            title = soup.html.head.title.string
        except AttributeError:
            logging.error('object soup invalid!(%s)' % url)
            continue

        title = self.processtitle(title)

        if self.keep_only_tags:
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):  # nested helper function
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before:
            tag = soup.find(**self.remove_tags_before)
            remove_beyond(tag, 'previousSibling')

        remove_tags = self.insta_remove_tags + self.remove_tags
        remove_ids = self.insta_remove_ids + self.remove_ids
        remove_classes = self.insta_remove_classes + self.remove_classes
        remove_attrs = self.insta_remove_attrs + self.remove_attrs

        for tag in soup.findAll(remove_tags):
            tag.extract()
        for id in remove_ids:
            for tag in soup.findAll(attrs={"id": id}):
                tag.extract()
        for cls in remove_classes:
            for tag in soup.findAll(attrs={"class": cls}):
                tag.extract()
        for attr in remove_attrs:
            for tag in soup.findAll(attrs={attr: True}):
                del tag[attr]
        for tag in soup.findAll(attrs={"type": "text/css"}):
            tag.extract()
        for cmt in soup.findAll(text=lambda text: isinstance(text, Comment)):
            cmt.extract()

        if self.keep_image:
            self.soupbeforeimage(soup)
            for img in soup.findAll('img'):
                imgurl = img['src']
                if not imgurl.startswith('http') and not imgurl.startswith('www'):
                    imgurl = self.urljoin(url, imgurl)
                imgresult = opener.open(imgurl)
                imgcontent = imgresult.content if imgresult.status_code == 200 else None
                if imgcontent:
                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        if imgtype == 'jpeg':
                            fnimg = "%d.jpg" % random.randint(10000, 99999999)
                        else:
                            fnimg = "%d.%s" % (random.randint(10000, 99999999), imgtype)
                        img['src'] = fnimg
                        yield (imgmime, imgurl, fnimg, imgcontent)
        else:
            for img in soup.findAll('img'):
                img.extract()

        self.soupprocessex(soup)
        content = soup.renderContents('utf-8').decode('utf-8')
        soup = None

        content = self.postprocess(content)
        yield (section, url, title, content)
def fulltext(self, url, decoder):
    # Images take up memory, so to save memory this function is also implemented as a generator
    if self.fulltext_by_instapaper:
        url = "http://www.instapaper.com/m?u=%s" % self.url_unescape(url)

    opener = URLOpener(self.host)
    result = opener.open(url)
    status_code, content = result.status_code, result.content
    if status_code != 200 or not content:
        logging.error('err(%d) to fetch %s.' % (status_code, url))
        return

    if self.page_encoding:
        content = content.decode(self.page_encoding)
    else:
        content = decoder.decode(content)

    content = self.preprocess(content)
    soup = BeautifulSoup(content)

    try:
        title = soup.html.head.title.string
    except AttributeError:
        logging.error('object soup invalid!(%s)' % url)
        return

    title = self.processtitle(title)
    soup.html.head.title.string = title

    if self.keep_only_tags:
        body = Tag(soup, 'body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    def remove_beyond(tag, next):  # nested helper function
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                ns = getattr(tag, next)
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after:
        rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before:
        tag = soup.find(**self.remove_tags_before)
        remove_beyond(tag, 'previousSibling')

    remove_tags = self.insta_remove_tags + self.remove_tags
    remove_ids = self.insta_remove_ids + self.remove_ids
    remove_classes = self.insta_remove_classes + self.remove_classes
    remove_attrs = self.insta_remove_attrs + self.remove_attrs

    for tag in soup.findAll(remove_tags):
        tag.extract()
    for id in remove_ids:
        for tag in soup.findAll(attrs={"id": id}):
            tag.extract()
    for cls in remove_classes:
        for tag in soup.findAll(attrs={"class": cls}):
            tag.extract()
    for attr in remove_attrs:
        for tag in soup.findAll(attrs={attr: True}):
            del tag[attr]
    for tag in soup.findAll(attrs={"type": "text/css"}):
        tag.extract()
    for cmt in soup.findAll(text=lambda text: isinstance(text, Comment)):
        cmt.extract()

    if self.keep_image:
        self.soupbeforeimage(soup)
        for img in soup.findAll('img'):
            imgurl = img['src']
            if not imgurl.startswith('http') and not imgurl.startswith('www'):
                imgurl = self.urljoin(url, imgurl)
            imgresult = opener.open(imgurl)
            imgcontent = imgresult.content if imgresult.status_code == 200 else None
            if imgcontent:
                imgtype = imghdr.what(None, imgcontent)
                if imgtype:
                    imgmime = r"image/" + imgtype
                    if imgtype == 'jpeg':
                        fnimg = "%d.jpg" % random.randint(10000, 99999999)
                    else:
                        fnimg = "%d.%s" % (random.randint(10000, 99999999), imgtype)
                    img['src'] = fnimg
                    yield (imgmime, imgurl, fnimg, imgcontent)
    else:
        for img in soup.findAll('img'):
            img.extract()

    self.soupprocessex(soup)
    content = soup.renderContents('utf-8').decode('utf-8')
    soup = None
    yield (title, None, None, content)
def parse(html, points_arr, steps_arr):
    soup = BeautifulSoup(html)
    routes = []
    route_index = 0
    while True:
        route_node = soup.find("div", {"id": "ts_route_" + str(route_index)})
        if route_node is None:
            break

        directions = []
        total_duration = 0
        steps = steps_arr[route_index]
        points = points_arr[route_index]

        for index in range(len(steps)):
            step = route_node.find(attrs={"id": "step_" + str(route_index) + "_" + str(index) + "_segment"})
            direction = ''
            duration = ''
            addinfo = ''
            line_number = ''
            arrive = ''
            addinfo_duration = ''

            if step is not None:
                direction_node = step.find(attrs={"class": "dir-ts-direction"})
                if direction_node is not None:
                    action_node = direction_node.find(attrs={"class": "action"})
                    location_node = direction_node.find(attrs={"class": "location"})
                    if action_node is not None and location_node is not None:
                        direction = str(action_node.text) + ' ' + str(location_node.text)
                    else:
                        direction = str(direction_node.renderContents())

                if step.nextSibling is not None:
                    arrive_node = step.nextSibling.find(text=re.compile('^Arrive.*'))
                    if arrive_node is not None:
                        arrive = arrive_node.nextSibling.text

                addinfo_nodes = step.findAll(attrs={"class": re.compile('^dir-ts-addinfo.*')})
                addinfo = remove_html_tags(get_nodes_text(addinfo_nodes))
                addinfo_duration_m = re.search(r'Service runs every\s(\d+)\smin', addinfo)
                if addinfo_duration_m is not None:
                    addinfo_duration = addinfo_duration_m.group(1)
                duration = parse_duration(addinfo)
                total_duration += duration

                segtext_nodes = step.findAll(attrs={"class": "dirsegtext"})
                direction += ', ' + get_nodes_text(segtext_nodes)

                line_number_node = step.find(attrs={"class": "trtline"})
                if line_number_node is not None:
                    line_number = str(line_number_node.text)

            start_point = points[steps[index]['depPoint']]
            end_point = points[steps[index]['arrPoint']]

            directions.append({
                'direction': remove_html_tags(direction),
                'duration': duration,
                'addinfo': addinfo,
                'addinfo_duration': addinfo_duration,
                'line_number': line_number,
                'arrive': arrive,
                'start_location': {'lat': start_point['lat'], 'lng': start_point['lng']},
                'end_location': {'lat': end_point['lat'], 'lng': end_point['lng']}
            })

        routes.append({'directions': directions, 'total_duration': total_duration})
        route_index += 1

    return routes
def view():
    addon_handle = int(sys.argv[1])
    addon = xbmcaddon.Addon()
    addonname = addon.getAddonInfo('name')
    args = urlparse.parse_qs(sys.argv[2][1:])
    xbmcplugin.setContent(addon_handle, 'movies')

    cat = args.get('cat', None)
    page = args.get('page', None)
    link = args.get('link', None)

    catalogues = [{'label': '\x56\x69\x64\x65\x6F\x20\x4D\xE1\xBB\x9B\x69'.decode('utf-8'), 'id': 'video/new/'},
                  {'label': 'Video Hot', 'id': 'video/hot/'}]

    # Play a link
    if link is not None:
        link_video = link[0]
        if link_video.startswith(web_url):
            r = requests.get(link[0])
            html = r.text
            #xbmc.log(html.encode('utf-8'))
            soup = BeautifulSoup(html)
            video_src = soup.find('embed', attrs={'id': 'zplayer'})
            video_flashvars = video_src.get('flashvars')
            args_video = urlparse.parse_qs(video_flashvars)
            link_video = args_video['file'][0]
        xbmc.Player().play(link_video)
        return

    # Load the categories
    if cat is None:
        for cat in catalogues:
            li = xbmcgui.ListItem(cat['label'])
            urlList = CMDTools.build_url(base_url, {'web': get_Web_Name(), 'cat': cat['id']})
            xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList, listitem=li, isFolder=True)
        xbmc.executebuiltin('Container.SetViewMode(501)')
        xbmcplugin.endOfDirectory(addon_handle)
        return

    # Load the contents of a category
    if cat is not None:
        if page is None:
            page = 1
        else:
            page = int(page[0])
        r = requests.get(web_url + cat[0] + str(page))
        html = r.text
        xbmc.log(html.encode('utf-8'))
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
        data_List = soup.findAll('a', attrs={'class': 'play'})
        # Build the item menu
        for item in data_List:
            link_item = web_url + item.get('href')
            if item.get('data-youtubeid') != '':
                link_item = "plugin://plugin.video.youtube/play/?video_id=" + item.get('data-youtubeid')
            img_item = item.find('img')
            img_src = img_item.get('src')
            img_alt = img_item.get('alt')
            li = xbmcgui.ListItem(img_alt)
            li.setThumbnailImage(img_src)
            li.setInfo(type='image', infoLabels="")
            urlList = CMDTools.build_url(base_url, {'web': get_Web_Name(), 'link': link_item, 'type': cat[0]})
            xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList, listitem=li)
        # Create the Next button
        li = xbmcgui.ListItem("Next")
        urlList = CMDTools.build_url(base_url, {'web': web_name, 'cat': cat[0], 'page': page + 1})
        xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList, listitem=li, isFolder=True)
        xbmc.executebuiltin('Container.SetViewMode(501)')
        #xbmc.executebuiltin("ClearSlideshow")
        #xbmc.executebuiltin("SlideShow(,,notrandom)")
        xbmcplugin.endOfDirectory(addon_handle)
        return

    xbmcplugin.endOfDirectory(addon_handle)
def save(self):
    self.title = bleach.clean(self.title)
    self.body = bleach.clean(
        self.body,
        tags=['code', 'a', 'p', 'span', 'div', 'strong', 'b', 'em', 'i', 'strike', 'cite',
              'mark', 'small', 'blockquote', 'figure', 'figcaption', 'strong', 'sub', 'sup',
              'img', 'iframe', 'br', 'pre', 'hr'],
        attributes=['style', 'title', 'src', 'frameborder', 'width', 'height', 'alt', 'href', 'target'],
        styles=['width', 'height', 'font-size', 'font-family', 'text-decoration', 'color',
                'background-color', 'text-align', 'padding-left'])
    self.last_updated = datetime.datetime.now()

    has_image = False
    has_video = False
    self.image = None
    self.video = None

    soup = BeautifulSoup(self.body)
    img = soup.find('img')
    if img and hasattr(img, 'src') and re.search('tiny_mce/plugins/emotions', img['src'].lower()) is None:
        self.image = img['src']
        has_image = True

    iframes = soup.findAll('iframe')
    for item in iframes:
        if re.match('^http(s)?://(.+\.)?(youtube.com|youtu.be)/', item['src'].lower()) is not None:
            if self.video is None:
                self.video = item['src']
                has_video = True
        else:
            item.decompose()

    is_insert = not self.is_saved()
    excerpt = strip_tags(self.body).strip()
    self.excerpt = '%s...' % excerpt[:253] if len(excerpt) > 253 else excerpt

    diff = None
    if not is_insert:
        previous = self.__class__.get(self.key())
        Tag.decrease(previous.tags)
        if previous.body != self.body:
            diff = DIFFER.make_table(previous.body, self.body)

    db.run_in_transaction_options(xg_on, super(self.__class__, self).put)

    if is_insert:
        db.run_in_transaction_options(xg_on, self.author.increase_article_count)
        for item in self.category.get_path():
            db.run_in_transaction_options(xg_on, item.increase_article_count)
    elif diff:
        db.run_in_transaction_options(xg_on, ArticleHistory(article=self, diff=diff).put)

    key_name = str(self.key().id())
    if has_image:
        ImageArticle.get_or_insert(key_name, article=self)
    else:
        found = ImageArticle.gql('WHERE article = :1', self).get()
        if found is not None:
            found.delete()
    if has_video:
        VideoArticle.get_or_insert(key_name, article=self)
    else:
        found = VideoArticle.gql('WHERE article = :1', self).get()
        if found is not None:
            found.delete()

    if is_insert:
        Subscription.get_or_insert('%s-%s' % (self.key().id(), self.author.key()),
                                   article=self, user=self.author)

    return self
def TPB(book=None, test=False):
    errmsg = ''
    provider = "TPB"
    host = lazylibrarian.CONFIG['TPB_HOST']
    if not host.startswith('http'):
        host = 'http://' + host

    providerurl = url_fix(host + "/s/?")

    cat = 0  # 601=ebooks, 102=audiobooks, 0=all, no mag category
    if 'library' in book:
        if book['library'] == 'AudioBook':
            cat = 102
        elif book['library'] == 'eBook':
            cat = 601
        elif book['library'] == 'magazine':
            cat = 0

    sterm = makeUnicode(book['searchterm'])

    page = 0
    results = []
    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    next_page = True

    while next_page:
        params = {
            "q": book['searchterm'],
            "category": cat,
            "page": page,
            "orderby": "99"
        }
        searchURL = providerurl + "?%s" % urllib.urlencode(params)

        next_page = False
        result, success = fetchURL(searchURL)
        if not success:
            # may return 404 if no results, not really an error
            if '404' in result:
                logger.debug("No results found from %s for %s" % (provider, sterm))
                success = True
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching data from %s: %s' % (provider, result))
                errmsg = result
            result = False

        if test:
            return success

        if result:
            logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider))
            soup = BeautifulSoup(result)
            # tpb uses a named table
            table = soup.find('table', id='searchResult')
            if table:
                rows = table.findAll('tr')
            else:
                rows = []

            if len(rows) > 1:
                rows = rows[1:]  # first row is headers

            for row in rows:
                td = row.findAll('td')
                if len(td) > 2:
                    try:
                        title = unaccented(
                            str(td[1]).split('title=')[1].split('>')[1].split('<')[0])
                        magnet = str(td[1]).split('href="')[1].split('"')[0]
                        size = unaccented(
                            td[1].text.split(', Size ')[1].split('iB')[0])
                        size = size.replace(' ', '')
                        mult = 1
                        try:
                            if 'K' in size:
                                size = size.split('K')[0]
                                mult = 1024
                            elif 'M' in size:
                                size = size.split('M')[0]
                                mult = 1024 * 1024
                            elif 'G' in size:
                                size = size.split('G')[0]
                                mult = 1024 * 1024 * 1024
                            size = int(float(size) * mult)
                        except (ValueError, IndexError):
                            size = 0
                        try:
                            seeders = int(td[2].text)
                        except ValueError:
                            seeders = 0

                        if minimumseeders < int(seeders):
                            # no point in asking for magnet link if not enough seeders
                            magurl = '%s/%s' % (host, magnet)
                            result, success = fetchURL(magurl)
                            if not success:
                                logger.debug('Error fetching url %s, %s' % (magurl, result))
                            else:
                                magnet = None
                                new_soup = BeautifulSoup(result)
                                for link in new_soup.findAll('a'):
                                    output = link.get('href')
                                    if output and output.startswith('magnet'):
                                        magnet = output
                                        break
                            if not magnet or not title:
                                logger.debug('Missing magnet or title')
                            else:
                                results.append({
                                    'bookid': book['bookid'],
                                    'tor_prov': provider,
                                    'tor_title': title,
                                    'tor_url': magnet,
                                    'tor_size': str(size),
                                    'tor_type': 'magnet',
                                    'priority': lazylibrarian.CONFIG['TPB_DLPRIORITY']
                                })
                                logger.debug('Found %s. Size: %s' % (title, size))
                                next_page = True
                        else:
                            logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders)))
                    except Exception as e:
                        logger.error("An error occurred in the %s parser: %s" % (provider, str(e)))
                        logger.debug('%s: %s' % (provider, traceback.format_exc()))

        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn('Maximum results page search reached, still more results available')
            next_page = False

    logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
br.submit()

br.select_form(nr=1)
req = br.click(type="submit", nr=1)
br.open(req)

br.select_form(nr=1)
br.find_control(name="sel_subj", nr=1).value = [R_SUBJECT]
br["sel_crse"] = R_COURSE
response = br.submit()

pageSoup = BeautifulSoup(response.read())

# Scrape information from pageText
tableHeads = []
tableData = []
for row in pageSoup.find('table', {'class': "datadisplaytable"}).findAll('tr'):
    if len(row.findAll('th')) > 1 and not tableHeads:
        # This is a header row
        for elem in row.findAll('th'):
            tableHeads.append(re.sub(r'<.*?>', '', str(elem)))
    else:
        # This is a content row
        ct = []
        for elem in row.findAll('td'):
            ct.append(re.sub(r'<.*?>', '', str(elem)))
        tableData.append(ct)

idxSection = tableHeads.index('Sec')
idxRem = tableHeads.index('Rem')

sectionsFree = False
def determineMinSDK():
    """
    Determines the minimum SDK version supported by the vulnerable application
    As a fallback, it allows the user to search Google PlayStore to identify the minimum SDK version if the data is unavailable in manifest.xml
    """
    # determine minimum supported versions
    common.minSdkVersion = 0
    common.sdk = common.xmldoc.getElementsByTagName("uses-sdk")
    determineSdk = ''

    if len(common.sdk) > 0:
        if 'android:minSdkVersion' in common.sdk[0].attributes.keys():
            try:
                common.minSdkVersion = common.sdk[0].attributes['android:minSdkVersion'].value
                logger.info(common.config.get('qarkhelper', 'MIN_SDK_VERSION') + str(common.minSdkVersion))
            except Exception as e:
                common.logger.error("Something went wrong trying to determine the version from the manifest: " + str(e))

    if common.minSdkVersion == 0:
        if common.source_or_apk == 2:
            common.minSdkVersion = findGradle()
            if common.minSdkVersion == 0:
                common.logger.info("We were unable to find the minimum SDK version in your source.")
                determineSdk = 'm'
            else:
                logger.info(common.config.get('qarkhelper', 'MIN_SDK_VERSION') + str(common.minSdkVersion))
        else:
            common.compare(common.sdk.length, 1, common.config.get('qarkhelper', 'USESDK_MISS'), 'false')
            print common.config.get('qarkhelper', 'GEN_OUTPUT_WARN')
            while True:
                determineSdk = raw_input("Which option would you prefer? (P)lay, (M)anual")
                if determineSdk.lower() in ('p', 'm'):
                    break
                else:
                    determineSdk = raw_input("Please enter either (p) or (m):")

        if determineSdk.lower() == 'p':
            # get package name from manifest if possible
            # make call to Play store
            # determine API version from https://play.google.com/store/apps/details?id=<package name>
            # will need to adjust the sdk[0] value for the checks below
            for a in common.xmldoc.getElementsByTagName('manifest'):
                if 'package' in a.attributes.keys():
                    print common.config.get('qarkhelper', 'PACK_FOUND')
                    package_name = a.attributes['package'].value
                    print package_name
                else:
                    package_name = raw_input(common.config.get('qarkhelper', 'NO_PACK_NAME'))
            try:
                logger.info(common.config.get('qarkhelper', 'DETERMINING_SDK_VERSION'))
                play_url = "https://play.google.com/store/apps/details?id="
                play_url += package_name
                print play_url

                page = urllib2.urlopen(play_url)
                html = BeautifulSoup(page.read())
                play_version = html.find(itemprop="operatingSystems")
                plat_version = re.findall('\d+.\d+', play_version.contents[0])
                if plat_version:
                    plat_version = [str(item) for item in plat_version]
                    api_plat_map = []
                    api_plat_map.append(['1', '1.0'])
                    api_plat_map.append(['2', '1.1'])
                    api_plat_map.append(['3', '1.5'])
                    api_plat_map.append(['4', '1.6'])
                    api_plat_map.append(['5', '2.0'])
                    api_plat_map.append(['6', '2.0.1'])
                    api_plat_map.append(['7', '2.1'])
                    api_plat_map.append(['8', '2.2'])
                    api_plat_map.append(['9', '2.3'])
                    api_plat_map.append(['10', '2.3.3'])
                    api_plat_map.append(['11', '3.0'])
                    api_plat_map.append(['12', '3.1'])
                    api_plat_map.append(['13', '3.2'])
                    api_plat_map.append(['14', '4.0'])
                    api_plat_map.append(['15', '4.0.3'])
                    api_plat_map.append(['16', '4.1'])
                    api_plat_map.append(['17', '4.2'])
                    api_plat_map.append(['18', '4.3'])  # Webviews have critical vuln, no more patches from Google
                    api_plat_map.append(['19', '4.4'])
                    api_plat_map.append(['20', '4.4'])  # This is actually 4.4W, a wearable only build, I'm assuming it is the same as 4.4 for our purposes
                    api_plat_map.append(['21', '5.0'])
                    api_plat_map.append(['22', '5.1'])  # This is the latest version, we'll assume this for newer, until update
                    # TODO - double check this, adding 5.1 may have broken it
                    for a in api_plat_map:
                        if StrictVersion(str(plat_version[0])) >= StrictVersion(str(a[1])):
                            common.minSdkVersion = a[0]
                    logger.info(common.config.get('qarkhelper', 'MIN_SDK_VERSION') + str(common.minSdkVersion))
                    manual = raw_input(common.config.get('qarkhelper', 'SDK_VALUE_MANUAL'))
                else:
                    print common.config.get('qarkhelper', 'CANT_DET_PLAY')
                # BUG - not processing the cases of wanting to enter it manually, if the retrieval of the play version is broken
            except HTTPError, e:
                print str(e)
                logger.error(common.config.get('qarkhelper', 'MIN_SDK_PLAY_STORE_FAILED'))
        elif (determineSdk.lower() == 'm' or common.minSdkVersion == 0):
            # does not actually become 1, just needs a value, since it wasn't found, so we assume worst case
            print common.term.cyan + common.term.bold + str(common.config.get('qarkhelper', 'NO_MIN_SDK')).decode('string-escape').format(t=common.term)
            enterSdk = raw_input(common.config.get('qarkhelper', 'PROMPT_MIN_SDK'))
            if enterSdk.lower() == 'y':
                sdkinput = 0
                while True:
                    sdkinput = int(raw_input(common.config.get('qarkhelper', 'PROMPT_VER') + common.config.get('qarkhelper', 'MAX_API_VERSION') + common.config.get('qarkhelper', 'PROMPT_VER2')))
                    if 0 < int(sdkinput) <= int(common.config.get('qarkhelper', 'MAX_API_VERSION')):
                        common.minSdkVersion = int(sdkinput)
                        break
            else:
                common.minSdkVersion = 7
def parse_display_url(tag):
    bs = BeautifulSoup(tag)
    display_url = bs.find('cite')
    dis_url = p.sub('', str(display_url))
    return str(dis_url)
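The snippets above (parse_organic_contents, parse_display_url) rely on module-level names `p` and `ul` that are not defined in this excerpt. A minimal sketch of plausible definitions, purely an assumption inferred from how they are used (a tag-stripping regex and a urllib alias):

import re
import urllib as ul  # assumed alias; the excerpt only calls ul.unquote(...)

# Assumed helper: strips HTML tags from the str() form of a soup node.
p = re.compile(r'<.*?>')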