def parse_response(self):
    soup = BeautifulSoup(self.response)
    head = soup.find("head")

    self.max_points = int(
        _get_value_from_soup(head, "meta", "value", {"name": "max-points"}, 0))

    if _get_value_from_soup(head, "meta", "value", {"name": "status"}) == "accepted":
        self.is_accepted = True

    meta_title = _get_value_from_soup(head, "meta", "content", {"name": "DC.Title"})
    if meta_title:
        self.meta["title"] = meta_title
    else:
        title = soup.find("title")
        if title:
            self.meta["title"] = title.contents

    self.meta["description"] = _get_value_from_soup(
        head, "meta", "content", {"name": "DC.Description"}, "")

    points = _get_value_from_soup(head, "meta", "value", {"name": "points"})
    if points is not None:
        self.points = int(points)
        self.is_graded = True
        self.is_accepted = True

    exercise_div = soup.body.find("div", {"id": "exercise"})
    if exercise_div is not None:
        self.content = exercise_div.renderContents()
    else:
        self.content = soup.body.renderContents()
def render(self):
    # TODO: fix and enable caching
    # content = cache.get(self.content_url)
    content = None
    url = self.content_url

    # If the page is not cached, retrieve it
    if content is None:
        opener = urllib2.build_opener()
        content = opener.open(url, timeout=5).read()
        # Save the page in cache
        # cache.set(self.content_url, content)

    soup = BeautifulSoup(content)

    # TODO: Disabled. Add GET parameter support and enable.
    # Make links absolute, quoted from http://stackoverflow.com/a/4468467:
    # for tag in soup.findAll('a', href=True):
    #     tag['href'] = urlparse.urljoin(self.content_url, tag['href'])

    # If there's no element specified, use the BODY.
    # Otherwise find the element with given id.
    if self.element_id == "":
        html = soup.find("body").renderContents()
    else:
        html = str(soup.find(id=self.element_id))

    return html
def render(self):
    content = cache.get(self.content_url)

    # If the page is not cached, retrieve it
    if content is None:
        opener = urllib2.build_opener()
        content = opener.open(self.content_url, timeout=5).read()
        # Save the page in cache
        cache.set(self.content_url, content)

    soup = BeautifulSoup(content)

    # Make links absolute, quoted from http://stackoverflow.com/a/4468467:
    for tag in soup.findAll('a', href=True):
        tag['href'] = urlparse.urljoin(self.content_url, tag['href'])

    # If there's no element specified, use the BODY.
    # Otherwise find the element with given id.
    if self.element_id == "":
        html = soup.find("body").renderContents()
    else:
        html = str(soup.find(id=self.element_id))

    return html
def parse(property_id, ratecode='SPGCP'):
    valid_property = False
    hotel_props = {'id': property_id}

    property_url = "%s?propertyID=%s" % (starwood_url, property_id)
    logging.debug("Property URL: %s" % property_url)

    starwood_response = urlfetch.fetch(url=property_url, deadline=10)
    if starwood_response:
        try:
            soup = BeautifulSoup(starwood_response.content) \
                .find(attrs={'id': 'propertyHighlight'}) \
                .find(attrs={'class': 'propertyContainer'})
        except:
            soup = None

        if soup:
            try:
                hotel_props['name'] = unicode(soup.find("a", "propertyName").contents[0]).strip()
                hotel_props['category'] = int(str(soup.find("span", "spgCategory").contents[0]).split()[-1])
                valid_property = True
            except:
                pass

            if valid_property:
                hotel_props['address'] = StarwoodParser.parse_address(soup)
                #hotel_props['awards'] = StarwoodParser.parse_starwood(soup.find("div", "tabsContentContainer").findAll("div", "tabContent"))
                hotel_props['image_url'] = str("http://www.starwoodhotels.com%s" % (soup.find("img", "propertyThumbnail")['src']))

    return valid_property and hotel_props or None
def handler(sock, url):
    htmlsource = sock.read().decode('gb18030', 'replace').encode('utf-8')
    soup = BeautifulSoup(htmlsource)
    content = soup.find("td", {"class": "jiawenzhang-type"})
    if content is None:
        return "content not found"
    return unicode(content)
def get_organic_data(html_data):
    bs = BeautifulSoup(str(html_data))
    div_filter = bs.find('div', {'id': 'ires'})
    if div_filter:
        contents = div_filter.findAll('li', {'class': 'g'})
        return contents
    return None
def fetch_trains(place_from, place_to, date):
    key = 'trains_' + place_from + '_' + place_to + '_' + str(date)
    data = memcache.get(key)  #@UndefinedVariable
    if data is not None:
        return data

    params = {'fromName': place_from,
              'toName': place_to,
              'when': utils.date_serialize(date),
              'search_type': 'suburban'}
    url = 'http://m.rasp.yandex.ru/search?' + urllib.urlencode(params)
    response = urlfetch.fetch(url)
    html = response.content

    soup = BeautifulSoup(html)
    list_node = soup.find("ul", {"class": "b-holster b-search-result"})
    if list_node is not None:
        regex = re.compile(r'<.*?>')
        b_nodes = list_node.findAll("b")
        result = []
        for b_node in b_nodes:
            data = regex.split(b_node.renderContents())
            try:
                time = [datetime.datetime.strptime(x, '%H:%M').time() for x in data]
                result.append(TrainTiming(time[0], time[1]))
            except:
                pass
        memcache.add(key, result, 60 * 60)  #@UndefinedVariable
        return result
def handler(sock, url):
    htmlsource = sock.read()
    soup = BeautifulSoup(htmlsource)
    content = soup.find(name="td", id=re.compile(r"postmessage_\d+"))
    if content is None:
        return "failed to read content"
    return unicode(content)
def assert_no_error_message_in_response(self, response):
    """Check that response has no error messages."""
    soup = BeautifulSoup(response)
    el = soup.find("p", "alert-error")
    if el:
        self.fail("error message found in response unexpectedly: {}".format(el.contents))
    els = soup.findAll("label", "alert-error")
    if els:
        self.fail("error message found in response unexpectedly: {}".format(els))
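A minimal usage sketch for the helper above, assuming it is available on a Django-style TestCase (for example via a mixin); the test class, URL, and form data are hypothetical:

from django.test import TestCase

class SignupFormTests(TestCase):
    # Hypothetical test: assumes assert_no_error_message_in_response is mixed
    # into this class and that a '/signup/' view exists in the project.
    def test_valid_signup_shows_no_errors(self):
        response = self.client.post('/signup/', {'email': 'user@example.com'})
        self.assert_no_error_message_in_response(response.content)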
def parse_matchup(league, team_id, week=None):
    team_info = {}

    params = {'mid1': team_id}
    if week:
        params.update({'week': week})
    matchup_url = build_url(league_id=league.id, page='matchup',
                            params=params, access_code=league.access_code)
    soup = BeautifulSoup(urlfetch.fetch(matchup_url).content).find('div', attrs={'id': 'yspmain'})

    try:
        week = int(soup.find('div', attrs={'id': 'matchupweeksubnav'})
                   .find('li', 'current selected').find('a').contents[0])
    except:
        week = 0

    matchup_soup = soup.find('div', attrs={'id': 'matchup'})
    team_name = matchup_soup.find('h2').find('a')['title']

    lineup_soup = matchup_soup.find('table', attrs={'id': 'statTable1'}).find('tbody')
    lineup = []
    for lineup_row in lineup_soup.findAll('tr')[:-1]:
        position = str(lineup_row.find('td').contents[0].strip())
        player_cell = lineup_row.find('td', 'player')
        try:
            player = player_cell.find('div').find('a').contents[0].strip()
        except:
            player = None
        try:
            player_status = player_cell.find('div', 'detail').find('span', 'status').contents[0].strip()
        except:
            player_status = None
        opp = str(lineup_row.find('td', 'opp').contents[0].strip()).replace(' ', '')
        projected_points = float(lineup_row.find('td', 'stat wide').contents[0])

        lineup.append({'position': position, 'player': player, 'status': player_status,
                       'projected': projected_points, 'opp': opp != 'Bye' and opp or None})

    return {'name': team_name, 'lineup': lineup, 'week': week}
def parse_organic_contents(raw_content, organic_pos):
    data_dict = {}
    data_dict['position'] = organic_pos

    b = BeautifulSoup(raw_content)
    rtitle = b.find('a')
    headline = p.sub('', str(rtitle))
    data_dict['title'] = headline

    display_url = parse_display_url(str(raw_content))
    data_dict['display_url'] = display_url

    rhref = b.find('a', href=True)
    url = str(rhref['href'])
    data_dict['url'] = ul.unquote(url)

    rtext = b.findAll('div', {'class': 's'})
    text = p.sub('', str(rtext))
    data_dict['text'] = text.replace(']', '').replace('[', '')

    return data_dict
def parse_page(writer, catalogue, page=1):
    print 'Parsing page %s' % page
    url = urllib.urlopen(URL % (catalogue, page))
    soup = BeautifulSoup(url)
    table = soup.find('table', attrs={'class': 'snippets'})
    for tr in table.findAll('tr'):
        # get name of the page
        name = tr.td.h4.a.string
        # get URL of the page
        url = tr.td.h4.a['href'].encode('utf-8')
        # get stats info
        stats = '?'
        stats_element = tr.find('p', attrs={'class': 'Stats'})
        if stats_element:
            stats = stats_element.strong.nextSibling.string[1:-11].replace(' ', '')
            if stats == 'wtrakc':
                stats = '?'
        # get price
        price = tr.find('td', attrs={'class': 'Price'}).strong.string[0:-12]
        # calculate CPM
        cpm = '?'
        try:
            cpm = (float(price) * 30) / int(stats) * 1000
        except:
            cpm = '?'
        # write to the file
        row = [name, url, stats, price.replace('.', ','), str(cpm).replace('.', ',')]
        print row
        writer.writerow(row)

    # find last page of the catalogue
    anchors = soup.findAll('a', href=re.compile('/networks/[0-9]+/websites\?page=[0-9]+'))
    if not anchors:
        return
    pages = []
    for anchor in anchors:
        number = re.match('/networks/[0-9]+/websites\?page=([0-9]+)', anchor['href']).group(1)
        pages.append(int(number))
    pages.sort()
    last = pages[-1]

    # parse next page if exists
    if last > page:
        next = page + 1
        parse_page(writer, catalogue, next)
def load(self):
    league_soup = BeautifulSoup(urllib2.urlopen(league_url).read())
    if league_soup:
        self.name = League.name(league_soup)
        self.mb = MessageBoard(self)

        team_rows = league_soup.find('table', attrs={'id': 'standingstable'}).tbody.findAll('tr')
        teams = [Team(self, team_id) for team_id in xrange(1, 2)]  # xrange(1, len(team_rows) + 1)

        for team in teams:
            print "%s, %s, \"%s\" %s\n" % (team.name, team.record, team.smack, team.roster)
def get_shows(): """docstring for get_shows""" html = retrieve_url(BASE_URL) soup = BeautifulSoup(html, fromEncoding="utf-8") #print soup #print "Autómánia" showsHtml = soup.find(id="topnav04-ul").findAll("li") shows = [] for show in showsHtml: shows.append({"title" : show.a.string, "url" : show.a['href']}) return shows
def basic_league_info(league_id, access_code=None):
    league_url = build_url(league_id=league_id, access_code=access_code)
    league_soup = BeautifulSoup(urlfetch.fetch(url=league_url).content).find('div', attrs={'id': 'yspmain'})

    if league_soup.find('div', attrs={'class': 'errors'}):
        valid_league = False
    else:
        valid_league = True

    if valid_league:
        league_name = str(league_soup.find('h1').contents[0].strip())
        try:
            teams_count = len(league_soup.find('table', attrs={'id': 'standingstable'}).find('tbody').findAll('tr'))
        except:
            teams_count = 0
        return {'name': league_name, 'teams_count': teams_count, 'errors': False}
    else:
        return {'errors': True}
def getWeatherInfo(self, my_phone):
    for user in self.users:
        # Build the query URL
        url = self.url + self.province_map[user.province.encode('gbk')] + '/' + self.city_map[user.city.encode('gbk')] + '.html'
        #print url
        page = urllib2.urlopen(url).read().decode('GBK').encode('utf-8')
        soup = BeautifulSoup(page)
        #print page.decode('utf-8').encode('gbk')
        city_body = soup.find('div', {'class': 'w365border city_body'})
        weather_info = city_body.findAll('div', {'class': 'weather_div'})
        self.sendSMS(my_phone, weather_info[1], user)  # tomorrow's weather
        self.sendSMS(my_phone, weather_info[2], user)  # the day after tomorrow's weather
def __init__(self, league, team_id):
    team_url = "http://%s%s%s/%d?pak=%s" % (league.sport, YAHOO_FB, league.league_id, team_id, league.access_code)
    team_soup = BeautifulSoup(urllib2.urlopen(team_url).read()).find('div', attrs={'id': 'bd'})
    team_info_soup = team_soup.find('div', attrs={'id': 'teaminfo'})

    self.name = clean(team_info_soup.h1.em.contents[0])
    self.record = Team.parse_record(team_info_soup)
    try:
        self.smack = clean(team_info_soup.find('p', attrs={'id': 'smacktext'}).contents[0])
    except:
        self.smack = ''
    self.roster = Roster(league, team_id).players
def get(self):
    # self.response.headers['Content-Type'] = 'text/plain'

    br = create_mechanize()

    base_url = "https://secure.hilton.com%s"
    login_landing_url = base_url % "/en/hhonors/login/login.jhtml"

    br.open(login_landing_url)
    br.select_form(name="loginForm")
    br.form['Username'] = '******'
    br.form['password'] = '******'
    br.submit()

    soup = BeautifulSoup(br.response().read())
    session_id = soup.find('form', attrs={'name': 'logout'})['action'].split(';jsessionid=')[1]

    hotel_url = "http://doubletree.hilton.com/en/dt/hotels/index.jhtml;jsessionid=%s?ctyhocn=CHINPDT" % (session_id)
    self.response.out.write("%s\n\n" % hotel_url)

    br.open(hotel_url)
    #a = br.select_form(name="rewardSearch")
    br.form.set_all_readonly(False)

    br.form.find_control(name="flexCheckInDay", kind="list").value = ["3"]
    br.form.find_control(name="flexCheckInMonthYr", kind="list").value = ["December 2010"]
    #br.form.find_control(name="checkInDay", kind="list").value = ["3"]
    #br.form.find_control(name="checkInMonthYr", kind="list").value = ["December 2010"]
    #br.form.find_control(name="checkOutDay", kind="list").value = ["5"]
    #br.form.find_control(name="checkOutMonthYr", kind="list").value = ["December 2010"]
    br.form.find_control(name="los", kind="list").value = ["2"]
    br.form["isReward"] = "true"
    br.form["flexibleSearch"] = "true"
    br.form["source"] = "hotelResWidget"
    br.submit()

    self.response.out.write("%s\n\n" % br.geturl())

    br.select_form(name="loginForm")
    br.form['Username'] = '******'
    br.form['password'] = '******'
    br.submit()

    self.response.out.write("%s\n\n" % br.geturl())

    for form in br.forms():
        pass
        # self.response.out.write("%s\n\n\n\n\n" % form)

    self.response.out.write("\n\n\n\n\n==============\n\n\n\n\n")
def get_shows(): """docstring for get_shows""" html = retrieve_url(BASE_URL) # fix the f****d up encoding in original document html = re.sub("Medi.n WebAudit RTLcsoport rtlmost.hu", "", html) soup = BeautifulSoup(html) #print soup.prettify #print "Autómánia" showsHtml = soup.find(id="topnav04-ul").findAll("li") #remove the last item showsHtml.pop(len(showsHtml)-1) shows = [] for show in showsHtml: shows.append({"title" : show.a.string, "url" : show.a['href']}) return shows
def get(self):
    def valid_setcode(soup):
        try:
            top_msg_div = soup.find('div', attrs={'id': 'topMsgDiv'})
            if top_msg_div.find('span', attrs={'class': 'error'}) and bool(top_msg_div.find('p').contents[0].strip()):
                return False
            else:
                return True
        except:
            return True

    self.response.headers['Content-Type'] = 'text/plain'

    try:
        set_code = int(self.request.get('set_code', 0))
    except:
        set_code = None

    if StarwoodSetCode.get_by_key_name(StarwoodSetCode.calc_key_name(set_code)):
        self.response.out.write("SET code entity already created.")
        return

    try:
        hotel_id = int(self.request.get('hotel_id', 0))
    except:
        hotel_id = None

    name = None

    if set_code and hotel_id:
        check_in = date.today() + relativedelta(months=1)
        check_out = check_in + relativedelta(days=1)

        #url = "https://www.starwoodhotels.com/preferredguest/search/ratelist.html?corporateAccountNumber=%d&lengthOfStay=1&roomOccupancyTotal=001&requestedChainCode=SI&requestedAffiliationCode=SI&theBrand=SPG&submitActionID=search&arrivalDate=2010-09-15&departureDate=2010-09-16&propertyID=%d&ciDate=09/15/2010&coDate=09/19/2010&numberOfRooms=01&numberOfAdults=01&roomBedCode=&ratePlanName=&accountInputField=57464&foo=5232"
        url = "https://www.starwoodhotels.com/preferredguest/search/ratelist.html?arrivalDate=%s&departureDate=%s&corporateAccountNumber=%d&propertyID=%d" \
            % (helper.date_to_str(check_in), helper.date_to_str(check_out), set_code, hotel_id)

        try:
            response = urlfetch.fetch(url, deadline=10)
        except DownloadError, details:
            logging.error("DownloadError: %s" % details)
            response = None

        if response:
            soup = BeautifulSoup(response.content)
            if valid_setcode(soup):
                try:
                    name = str(soup.find('table', attrs={'id': 'rateListTable'}).find('tbody').find('tr').find('td', attrs={'class': 'rateDescription'}).find('p').contents[0].strip())
                except:
                    name = None
def parse_matchup_score2(league, team_id):
    generic_matchup_url = build_url(league_id=league.id, page='matchup', access_code=league.access_code)
    try:
        soup = BeautifulSoup(urlfetch.fetch(generic_matchup_url).content)
        matchup_path = soup.find('div', attrs={'class': 'scoreboard'}).find('td', attrs={'class': 'last'}).find('a')['href']
    except:
        matchup_path = None

    matchup_url = None
    if matchup_path:
        params = {}
        for p in matchup_path.split('?')[1].split('&'):
            k, v = p.split('=')
            params[k] = v
        matchup_url = build_url(league_id=league.id, page='matchup', params=params, access_code=league.access_code)

    return matchup_url
def getRes(self, url):
    page = urllib2.urlopen(url).read().\
        decode('GBK').encode('utf-8')
    ret_info = []
    soup = BeautifulSoup(page)
    # Get the product list from the final search results page
    deal_table = soup.find('table', {'class': 'deals'})
    # 'first' is the lowest-priced row; if it does not meet the user's
    # requirements, none of the rows below it will either, so we can return directly
    min_price = deal_table.findAll('tr', {'class': 'first'}, limit=1)[0]
    try:
        goods_name = str(min_price.h3.contents[0].string)
    except Exception, e:
        self.py_log.log("Failed to get the product name", self.py_log.get_file_name(), self.py_log.get_line())
        goods_name = "NULL"
def web_segments(self):
    segment_props = []

    response = fetch_url("http://washington.cbslocal.com/audio-on-demand/the-sports-junkies/")
    if response and response.status_code == 200:
        page_soup = BeautifulSoup(response.content)
        try:
            segment_blocks = page_soup.find('div', {'class': 'cols'}).find('ul').findAll('li')
        except:
            segment_blocks = None

        if segment_blocks:
            segment_blocks.reverse()
            for segment_block in segment_blocks:
                props = self.parse_web_segment(segment_block)
                self.response.out.write("%s\n" % props)
                segment_props.append(props)

    return segment_props
def handler(content, url):
    m = re.search(r".*?(\d+)\.htm", url)
    if m is None or len(m.groups()) < 1:
        return "Failed to parse url"
    com_url = "http://www.cnbeta.com/comment/normal/%s.html" % m.groups()[0]
    comment = ""
    try:
        sock = urllib2.urlopen(com_url)
        htmlsource = sock.read().decode('gb18030', 'replace').encode('utf-8')
        comment_soup = BeautifulSoup(htmlsource)
        comment = "".join("%s<br/>" % x.text for x in comment_soup.findAll("dd", {"class": "re_detail"}))
    except urllib2.HTTPError:
        comment = ""
    content_soup = BeautifulSoup(content, fromEncoding="gbk")
    content = content_soup.find("div", {"id": "news_content"})
    if content is None:
        return "content not found"
    result = u"%s<br/>Comment%s" % (content, comment)
    return unicode(result)
def parse(html, points_array, steps_array):
    soup = BeautifulSoup(html)
    routes = []
    route_index = 0
    while True:
        route_node = soup.find("div", {"id": "ts_route_" + str(route_index)})
        if route_node is None:
            break

        directions = []
        total_duration = 0
        steps = steps_array[route_index]
        points = points_array[route_index]

        for index in range(len(steps) - 1):
            step_node = route_node.find(attrs={"id": "step_" + str(route_index) + "_" + str(index) + "_segment"})
            step = common.RouteStep()

            if step_node is not None:
                step.direction = get_node_text(step_node.find(attrs={"class": "dir-ts-direction"}))

                segment_text = get_nodes_text(step_node.findAll(attrs={"class": "dirsegtext"}))
                if segment_text != '':
                    if step.direction.find('Unknown') > 0:
                        # Prevent 'Walk to Unknown' direction
                        step.direction = segment_text
                    else:
                        step.direction += ': ' + segment_text

                step.addinfo = get_nodes_text(step_node.findAll(attrs={"class": re.compile('^dir-ts-addinfo.*')})).replace('(', '').replace(')', '')
                step.duration = parse_duration(step.addinfo)
                step.initial_duration = step.duration
                total_duration += step.duration

                transport_type = get_transport(step.direction)
                if transport_type is None or transport_type == 'Walk':
                    step.direction = clean_walk_direction(step.direction)
                else:
                    line_number = get_node_text(step_node.find(attrs={"class": "trtline"}))
                    step.service_interval = parse_service_interval(step.addinfo)
                    step.transport = common.Transport(transport_type, line_number, step.service_interval)
                    step.direction = _(step.transport.type)
                    step.transport.stops = parse_stops(step.addinfo)
                    if step.transport.is_subway():
                        step.direction += utils.subway_color(' ' + _('line') + ' ' + str(step.transport.line_number), step.transport.line_number)
                    else:
                        step.direction += ' ' + _('number') + ' ' + step.transport.line_number
                    step.start_name = clean_walk_direction(get_node_text(step_node.find('b')))

                if step_node.nextSibling is not None:
                    arrive_node = step_node.nextSibling.find(text=re.compile('^Arrive.*'))
                    if arrive_node is not None:
                        step.end_name = clean_walk_direction(get_node_text(arrive_node.nextSibling))

            start_point = points[steps[index]['depPoint']]
            end_point = points[steps[index]['arrPoint']]
            step.start_location = common.GeoPoint(start_point['lat'], start_point['lng'])
            step.end_location = common.GeoPoint(end_point['lat'], end_point['lng'])

            if not step.is_walk():
                directions.append(step)

        routes.append(common.Route(directions, 'google'))
        route_index += 1

    return routes
import urllib2

from lib.BeautifulSoup import BeautifulSoup

agent = """Sosospider+(+http://help.soso.com/webspider.htm)"""

blog_url = 'http://blog.sina.com.cn/s/articlelist_1517582220_0_1.html'

spider_handle = urllib2.urlopen(blog_url)
blog_content = spider_handle.read()
soup = BeautifulSoup(blog_content, fromEncoding='utf-8')
item_list = soup.findAll('span', {'class': 'atc_title'})

urls = ['http://blog.csdn.net/heiyeshuwu/archive/2010/12/19/6085876.aspx']
#for item in item_list:
#    urls.append(item.a['href'])

for url in urls:
    request = urllib2.Request(url)
    request.add_header('User-Agent', agent)
    handle = urllib2.urlopen(request).read()
    article_soup = BeautifulSoup(handle, fromEncoding='utf-8')
    title = article_soup.find('h1', {'class': 'title_txt'})
    content = article_soup.find('div', {'id': 'sina_keyword_ad_area2'})
    # tmp = []
    # for c in content.contents:
    #     print type(c)
    #     tmp.append(c.__str__('utf-8'))
    print url
    print title.contents
    print title.contents[2].replace('\t', '').replace('\r\n', '')
    # print ''.join(tmp)
    exit()
def Items(self):
    """
    Generator yielding tuples:
    for HTML: (section, url, title, content)
    for images: (mime, url, filename, content)
    """
    cnt4debug = 0
    decoder = AutoDecoder()
    for section, url in self.feeds:
        cnt4debug += 1
        if IsRunInLocal and cnt4debug > 1:
            break

        opener = URLOpener(self.host)
        result = opener.open(url)
        status_code, content = result.status_code, result.content
        if status_code != 200 or not content:
            logging.error('err(%d) to fetch %s.' % (status_code, url))
            continue

        if self.page_encoding:
            content = content.decode(self.page_encoding)
        else:
            content = decoder.decode(content)

        content = self.preprocess(content)
        soup = BeautifulSoup(content)

        try:
            title = soup.html.head.title.string
        except AttributeError:
            logging.error('object soup invalid!(%s)' % url)
            continue

        title = self.processtitle(title)

        if self.keep_only_tags:
            body = Tag(soup, 'body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    self.keep_only_tags = [self.keep_only_tags]
                for spec in self.keep_only_tags:
                    for tag in soup.find('body').findAll(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replaceWith(body)
            except AttributeError:  # soup has no body element
                pass

        def remove_beyond(tag, next):  # nested helper function
            while tag is not None and getattr(tag, 'name', None) != 'body':
                after = getattr(tag, next)
                while after is not None:
                    ns = getattr(tag, next)
                    after.extract()
                    after = ns
                tag = tag.parent

        if self.remove_tags_after:
            rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
            for spec in rt:
                tag = soup.find(**spec)
                remove_beyond(tag, 'nextSibling')

        if self.remove_tags_before:
            tag = soup.find(**self.remove_tags_before)
            remove_beyond(tag, 'previousSibling')

        remove_tags = self.insta_remove_tags + self.remove_tags
        remove_ids = self.insta_remove_ids + self.remove_ids
        remove_classes = self.insta_remove_classes + self.remove_classes
        remove_attrs = self.insta_remove_attrs + self.remove_attrs

        for tag in soup.findAll(remove_tags):
            tag.extract()
        for id in remove_ids:
            for tag in soup.findAll(attrs={"id": id}):
                tag.extract()
        for cls in remove_classes:
            for tag in soup.findAll(attrs={"class": cls}):
                tag.extract()
        for attr in remove_attrs:
            for tag in soup.findAll(attrs={attr: True}):
                del tag[attr]
        for tag in soup.findAll(attrs={"type": "text/css"}):
            tag.extract()
        for cmt in soup.findAll(text=lambda text: isinstance(text, Comment)):
            cmt.extract()

        if self.keep_image:
            self.soupbeforeimage(soup)
            for img in soup.findAll('img'):
                imgurl = img['src']
                if not imgurl.startswith('http') and not imgurl.startswith('www'):
                    imgurl = self.urljoin(url, imgurl)
                imgresult = opener.open(imgurl)
                imgcontent = imgresult.content if imgresult.status_code == 200 else None
                if imgcontent:
                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        if imgtype == 'jpeg':
                            fnimg = "%d.jpg" % random.randint(10000, 99999999)
                        else:
                            fnimg = "%d.%s" % (random.randint(10000, 99999999), imgtype)
                        img['src'] = fnimg
                        yield (imgmime, imgurl, fnimg, imgcontent)
        else:
            for img in soup.findAll('img'):
                img.extract()

        self.soupprocessex(soup)
        content = soup.renderContents('utf-8').decode('utf-8')
        soup = None

        content = self.postprocess(content)
        yield (section, url, title, content)
def fulltext(self, url, decoder):
    # Images take up memory, so to save memory this function is also implemented as a generator
    if self.fulltext_by_instapaper:
        url = "http://www.instapaper.com/m?u=%s" % self.url_unescape(url)

    opener = URLOpener(self.host)
    result = opener.open(url)
    status_code, content = result.status_code, result.content
    if status_code != 200 or not content:
        logging.error('err(%d) to fetch %s.' % (status_code, url))
        return

    if self.page_encoding:
        content = content.decode(self.page_encoding)
    else:
        content = decoder.decode(content)

    content = self.preprocess(content)
    soup = BeautifulSoup(content)

    try:
        title = soup.html.head.title.string
    except AttributeError:
        logging.error('object soup invalid!(%s)' % url)
        return

    title = self.processtitle(title)
    soup.html.head.title.string = title

    if self.keep_only_tags:
        body = Tag(soup, 'body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    def remove_beyond(tag, next):  # nested helper function
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                ns = getattr(tag, next)
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after:
        rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before:
        tag = soup.find(**self.remove_tags_before)
        remove_beyond(tag, 'previousSibling')

    remove_tags = self.insta_remove_tags + self.remove_tags
    remove_ids = self.insta_remove_ids + self.remove_ids
    remove_classes = self.insta_remove_classes + self.remove_classes
    remove_attrs = self.insta_remove_attrs + self.remove_attrs

    for tag in soup.findAll(remove_tags):
        tag.extract()
    for id in remove_ids:
        for tag in soup.findAll(attrs={"id": id}):
            tag.extract()
    for cls in remove_classes:
        for tag in soup.findAll(attrs={"class": cls}):
            tag.extract()
    for attr in remove_attrs:
        for tag in soup.findAll(attrs={attr: True}):
            del tag[attr]
    for tag in soup.findAll(attrs={"type": "text/css"}):
        tag.extract()
    for cmt in soup.findAll(text=lambda text: isinstance(text, Comment)):
        cmt.extract()

    if self.keep_image:
        self.soupbeforeimage(soup)
        for img in soup.findAll('img'):
            imgurl = img['src']
            if not imgurl.startswith('http') and not imgurl.startswith('www'):
                imgurl = self.urljoin(url, imgurl)
            imgresult = opener.open(imgurl)
            imgcontent = imgresult.content if imgresult.status_code == 200 else None
            if imgcontent:
                imgtype = imghdr.what(None, imgcontent)
                if imgtype:
                    imgmime = r"image/" + imgtype
                    if imgtype == 'jpeg':
                        fnimg = "%d.jpg" % random.randint(10000, 99999999)
                    else:
                        fnimg = "%d.%s" % (random.randint(10000, 99999999), imgtype)
                    img['src'] = fnimg
                    yield (imgmime, imgurl, fnimg, imgcontent)
    else:
        for img in soup.findAll('img'):
            img.extract()

    self.soupprocessex(soup)
    content = soup.renderContents('utf-8').decode('utf-8')
    soup = None
    yield (title, None, None, content)
def parse(html, points_arr, steps_arr):
    soup = BeautifulSoup(html)
    routes = []
    route_index = 0
    while True:
        route_node = soup.find("div", {"id": "ts_route_" + str(route_index)})
        if route_node is None:
            break

        directions = []
        total_duration = 0
        steps = steps_arr[route_index]
        points = points_arr[route_index]

        for index in range(len(steps)):
            step = route_node.find(attrs={"id": "step_" + str(route_index) + "_" + str(index) + "_segment"})
            direction = ''
            duration = ''
            addinfo = ''
            line_number = ''
            arrive = ''
            addinfo_duration = ''

            if step is not None:
                direction_node = step.find(attrs={"class": "dir-ts-direction"})
                if direction_node is not None:
                    action_node = direction_node.find(attrs={"class": "action"})
                    location_node = direction_node.find(attrs={"class": "location"})
                    if action_node is not None and location_node is not None:
                        direction = str(action_node.text) + ' ' + str(location_node.text)
                    else:
                        direction = str(direction_node.renderContents())

                if step.nextSibling is not None:
                    arrive_node = step.nextSibling.find(text=re.compile('^Arrive.*'))
                    if arrive_node is not None:
                        arrive = arrive_node.nextSibling.text

                addinfo_nodes = step.findAll(attrs={"class": re.compile('^dir-ts-addinfo.*')})
                addinfo = remove_html_tags(get_nodes_text(addinfo_nodes))
                addinfo_duration_m = re.search(r'Service runs every\s(\d+)\smin', addinfo)
                if addinfo_duration_m is not None:
                    addinfo_duration = addinfo_duration_m.group(1)
                duration = parse_duration(addinfo)
                total_duration += duration

                segtext_nodes = step.findAll(attrs={"class": "dirsegtext"})
                direction += ', ' + get_nodes_text(segtext_nodes)

                line_number_node = step.find(attrs={"class": "trtline"})
                if line_number_node is not None:
                    line_number = str(line_number_node.text)

            start_point = points[steps[index]['depPoint']]
            end_point = points[steps[index]['arrPoint']]

            directions.append({
                'direction': remove_html_tags(direction),
                'duration': duration,
                'addinfo': addinfo,
                'addinfo_duration': addinfo_duration,
                'line_number': line_number,
                'arrive': arrive,
                'start_location': {'lat': start_point['lat'], 'lng': start_point['lng']},
                'end_location': {'lat': end_point['lat'], 'lng': end_point['lng']}
            })

        routes.append({'directions': directions, 'total_duration': total_duration})
        route_index += 1

    return routes
def view():
    addon_handle = int(sys.argv[1])
    addon = xbmcaddon.Addon()
    addonname = addon.getAddonInfo('name')
    args = urlparse.parse_qs(sys.argv[2][1:])
    xbmcplugin.setContent(addon_handle, 'movies')

    cat = args.get('cat', None)
    page = args.get('page', None)
    link = args.get('link', None)

    catalogues = [{'label': '\x56\x69\x64\x65\x6F\x20\x4D\xE1\xBB\x9B\x69'.decode('utf-8'), 'id': 'video/new/'},
                  {'label': 'Video Hot', 'id': 'video/hot/'}]

    # Play a link
    if link is not None:
        link_video = link[0]
        if link_video.startswith(web_url):
            r = requests.get(link[0])
            html = r.text
            #xbmc.log(html.encode('utf-8'))
            soup = BeautifulSoup(html)
            video_src = soup.find('embed', attrs={'id': 'zplayer'})
            video_flashvars = video_src.get('flashvars')
            args_video = urlparse.parse_qs(video_flashvars)
            link_video = args_video['file'][0]
        xbmc.Player().play(link_video)
        return

    # Load the categories
    if cat is None:
        for cat in catalogues:
            li = xbmcgui.ListItem(cat['label'])
            urlList = CMDTools.build_url(base_url, {'web': get_Web_Name(), 'cat': cat['id']})
            xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList, listitem=li, isFolder=True)
        xbmc.executebuiltin('Container.SetViewMode(501)')
        xbmcplugin.endOfDirectory(addon_handle)
        return

    # Load the contents of a category
    if cat is not None:
        if page is None:
            page = 1
        else:
            page = int(page[0])
        r = requests.get(web_url + cat[0] + str(page))
        html = r.text
        xbmc.log(html.encode('utf-8'))
        soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
        data_List = soup.findAll('a', attrs={'class': 'play'})
        # Build the item menu
        for item in data_List:
            link_item = web_url + item.get('href')
            if item.get('data-youtubeid') != '':
                link_item = "plugin://plugin.video.youtube/play/?video_id=" + item.get('data-youtubeid')
            img_item = item.find('img')
            img_src = img_item.get('src')
            img_alt = img_item.get('alt')
            li = xbmcgui.ListItem(img_alt)
            li.setThumbnailImage(img_src)
            li.setInfo(type='image', infoLabels="")
            urlList = CMDTools.build_url(base_url, {'web': get_Web_Name(), 'link': link_item, 'type': cat[0]})
            xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList, listitem=li)
        # Create the Next button
        li = xbmcgui.ListItem("Next")
        urlList = CMDTools.build_url(base_url, {'web': web_name, 'cat': cat[0], 'page': page + 1})
        xbmcplugin.addDirectoryItem(handle=addon_handle, url=urlList, listitem=li, isFolder=True)
        xbmc.executebuiltin('Container.SetViewMode(501)')
        #xbmc.executebuiltin("ClearSlideshow")
        #xbmc.executebuiltin("SlideShow(,,notrandom)")
        xbmcplugin.endOfDirectory(addon_handle)
        return

    xbmcplugin.endOfDirectory(addon_handle)
def save(self):
    self.title = bleach.clean(self.title)
    self.body = bleach.clean(
        self.body,
        tags=['code', 'a', 'p', 'span', 'div', 'strong', 'b', 'em', 'i', 'strike', 'cite',
              'mark', 'small', 'blockquote', 'figure', 'figcaption', 'strong', 'sub', 'sup',
              'img', 'iframe', 'br', 'pre', 'hr'],
        attributes=['style', 'title', 'src', 'frameborder', 'width', 'height', 'alt', 'href', 'target'],
        styles=['width', 'height', 'font-size', 'font-family', 'text-decoration', 'color',
                'background-color', 'text-align', 'padding-left'])
    self.last_updated = datetime.datetime.now()

    has_image = False
    has_video = False
    self.image = None
    self.video = None

    soup = BeautifulSoup(self.body)
    img = soup.find('img')
    if img and hasattr(img, 'src') and re.search('tiny_mce/plugins/emotions', img['src'].lower()) is None:
        self.image = img['src']
        has_image = True

    iframes = soup.findAll('iframe')
    for item in iframes:
        if re.match('^http(s)?://(.+\.)?(youtube.com|youtu.be)/', item['src'].lower()) is not None:
            if self.video is None:
                self.video = item['src']
                has_video = True
        else:
            item.decompose()

    is_insert = not self.is_saved()
    excerpt = strip_tags(self.body).strip()
    self.excerpt = '%s...' % excerpt[:253] if len(excerpt) > 253 else excerpt

    diff = None
    if not is_insert:
        previous = self.__class__.get(self.key())
        Tag.decrease(previous.tags)
        if previous.body != self.body:
            diff = DIFFER.make_table(previous.body, self.body)

    db.run_in_transaction_options(xg_on, super(self.__class__, self).put)

    if is_insert:
        db.run_in_transaction_options(xg_on, self.author.increase_article_count)
        for item in self.category.get_path():
            db.run_in_transaction_options(xg_on, item.increase_article_count)
    elif diff:
        db.run_in_transaction_options(xg_on, ArticleHistory(article=self, diff=diff).put)

    key_name = str(self.key().id())
    if has_image:
        ImageArticle.get_or_insert(key_name, article=self)
    else:
        found = ImageArticle.gql('WHERE article = :1', self).get()
        if found is not None:
            found.delete()
    if has_video:
        VideoArticle.get_or_insert(key_name, article=self)
    else:
        found = VideoArticle.gql('WHERE article = :1', self).get()
        if found is not None:
            found.delete()

    if is_insert:
        Subscription.get_or_insert('%s-%s' % (self.key().id(), self.author.key()),
                                   article=self, user=self.author)

    return self
def TPB(book=None, test=False):
    errmsg = ''
    provider = "TPB"
    host = lazylibrarian.CONFIG['TPB_HOST']
    if not host.startswith('http'):
        host = 'http://' + host

    providerurl = url_fix(host + "/s/?")

    cat = 0  # 601=ebooks, 102=audiobooks, 0=all, no mag category
    if 'library' in book:
        if book['library'] == 'AudioBook':
            cat = 102
        elif book['library'] == 'eBook':
            cat = 601
        elif book['library'] == 'magazine':
            cat = 0

    sterm = makeUnicode(book['searchterm'])

    page = 0
    results = []
    minimumseeders = int(lazylibrarian.CONFIG['NUMBEROFSEEDERS']) - 1
    next_page = True

    while next_page:
        params = {
            "q": book['searchterm'],
            "category": cat,
            "page": page,
            "orderby": "99"
        }
        searchURL = providerurl + "?%s" % urllib.urlencode(params)

        next_page = False
        result, success = fetchURL(searchURL)
        if not success:
            # may return 404 if no results, not really an error
            if '404' in result:
                logger.debug("No results found from %s for %s" % (provider, sterm))
                success = True
            else:
                logger.debug(searchURL)
                logger.debug('Error fetching data from %s: %s' % (provider, result))
                errmsg = result
            result = False

        if test:
            return success

        if result:
            logger.debug('Parsing results from <a href="%s">%s</a>' % (searchURL, provider))
            soup = BeautifulSoup(result)
            # tpb uses a named table
            table = soup.find('table', id='searchResult')
            if table:
                rows = table.findAll('tr')
            else:
                rows = []

            if len(rows) > 1:
                rows = rows[1:]  # first row is headers

            for row in rows:
                td = row.findAll('td')
                if len(td) > 2:
                    try:
                        title = unaccented(
                            str(td[1]).split('title=')[1].split('>')[1].split('<')[0])
                        magnet = str(td[1]).split('href="')[1].split('"')[0]
                        size = unaccented(
                            td[1].text.split(', Size ')[1].split('iB')[0])
                        size = size.replace(' ', '')
                        mult = 1
                        try:
                            if 'K' in size:
                                size = size.split('K')[0]
                                mult = 1024
                            elif 'M' in size:
                                size = size.split('M')[0]
                                mult = 1024 * 1024
                            elif 'G' in size:
                                size = size.split('G')[0]
                                mult = 1024 * 1024 * 1024
                            size = int(float(size) * mult)
                        except (ValueError, IndexError):
                            size = 0
                        try:
                            seeders = int(td[2].text)
                        except ValueError:
                            seeders = 0

                        if minimumseeders < int(seeders):
                            # no point in asking for magnet link if not enough seeders
                            magurl = '%s/%s' % (host, magnet)
                            result, success = fetchURL(magurl)
                            if not success:
                                logger.debug('Error fetching url %s, %s' % (magurl, result))
                            else:
                                magnet = None
                                new_soup = BeautifulSoup(result)
                                for link in new_soup.findAll('a'):
                                    output = link.get('href')
                                    if output and output.startswith('magnet'):
                                        magnet = output
                                        break
                            if not magnet or not title:
                                logger.debug('Missing magnet or title')
                            else:
                                results.append({
                                    'bookid': book['bookid'],
                                    'tor_prov': provider,
                                    'tor_title': title,
                                    'tor_url': magnet,
                                    'tor_size': str(size),
                                    'tor_type': 'magnet',
                                    'priority': lazylibrarian.CONFIG['TPB_DLPRIORITY']
                                })
                                logger.debug('Found %s. Size: %s' % (title, size))
                                next_page = True
                        else:
                            logger.debug('Found %s but %s seeder%s' % (title, seeders, plural(seeders)))
                    except Exception as e:
                        logger.error("An error occurred in the %s parser: %s" % (provider, str(e)))
                        logger.debug('%s: %s' % (provider, traceback.format_exc()))

        page += 1
        if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < page:
            logger.warn('Maximum results page search reached, still more results available')
            next_page = False

    logger.debug("Found %i result%s from %s for %s" % (len(results), plural(len(results)), provider, sterm))
    return results, errmsg
br.submit()

br.select_form(nr=1)
req = br.click(type="submit", nr=1)
br.open(req)

br.select_form(nr=1)
br.find_control(name="sel_subj", nr=1).value = [R_SUBJECT]
br["sel_crse"] = R_COURSE
response = br.submit()

pageSoup = BeautifulSoup(response.read())

# Scrape information from pageText
tableHeads = []
tableData = []
for row in pageSoup.find('table', {'class': "datadisplaytable"}).findAll('tr'):
    if len(row.findAll('th')) > 1 and not tableHeads:
        # This is a header row
        for elem in row.findAll('th'):
            tableHeads.append(re.sub(r'<.*?>', '', str(elem)))
    else:
        # This is a content row
        ct = []
        for elem in row.findAll('td'):
            ct.append(re.sub(r'<.*?>', '', str(elem)))
        tableData.append(ct)

idxSection = tableHeads.index('Sec')
idxRem = tableHeads.index('Rem')

sectionsFree = False
def determineMinSDK():
    """
    Determines the minimum SDK version supported by the vulnerable application
    As a fallback, it allows the user to search Google PlayStore to identify the minimum SDK version if the data is unavailable in manifest.xml
    """
    # determine minimum supported versions
    common.minSdkVersion = 0
    common.sdk = common.xmldoc.getElementsByTagName("uses-sdk")
    determineSdk = ''

    if len(common.sdk) > 0:
        if 'android:minSdkVersion' in common.sdk[0].attributes.keys():
            try:
                common.minSdkVersion = common.sdk[0].attributes['android:minSdkVersion'].value
                logger.info(common.config.get('qarkhelper', 'MIN_SDK_VERSION') + str(common.minSdkVersion))
            except Exception as e:
                common.logger.error("Something went wrong trying to determine the version from the manifest: " + str(e))

    if common.minSdkVersion == 0:
        if common.source_or_apk == 2:
            common.minSdkVersion = findGradle()
            if common.minSdkVersion == 0:
                common.logger.info("We were unable to find the minimum SDK version in your source.")
                determineSdk = 'm'
            else:
                logger.info(common.config.get('qarkhelper', 'MIN_SDK_VERSION') + str(common.minSdkVersion))
        else:
            common.compare(common.sdk.length, 1, common.config.get('qarkhelper', 'USESDK_MISS'), 'false')
            print common.config.get('qarkhelper', 'GEN_OUTPUT_WARN')
            while True:
                determineSdk = raw_input("Which option would you prefer? (P)lay, (M)anual")
                if determineSdk.lower() in ('p', 'm'):
                    break
                else:
                    determineSdk = raw_input("Please enter either (p) or (m):")

        if determineSdk.lower() == 'p':
            # get package name from manifest if possible
            # make call to Play store
            # determine API version from https://play.google.com/store/apps/details?id=<package name>
            # will need to adjust the sdk[0] value for the checks below
            for a in common.xmldoc.getElementsByTagName('manifest'):
                if 'package' in a.attributes.keys():
                    print common.config.get('qarkhelper', 'PACK_FOUND')
                    package_name = a.attributes['package'].value
                    print package_name
                else:
                    package_name = raw_input(common.config.get('qarkhelper', 'NO_PACK_NAME'))
            try:
                logger.info(common.config.get('qarkhelper', 'DETERMINING_SDK_VERSION'))
                play_url = "https://play.google.com/store/apps/details?id="
                play_url += package_name
                print play_url

                page = urllib2.urlopen(play_url)
                html = BeautifulSoup(page.read())
                play_version = html.find(itemprop="operatingSystems")
                plat_version = re.findall('\d+.\d+', play_version.contents[0])
                if plat_version:
                    plat_version = [str(item) for item in plat_version]
                    api_plat_map = []
                    api_plat_map.append(['1', '1.0'])
                    api_plat_map.append(['2', '1.1'])
                    api_plat_map.append(['3', '1.5'])
                    api_plat_map.append(['4', '1.6'])
                    api_plat_map.append(['5', '2.0'])
                    api_plat_map.append(['6', '2.0.1'])
                    api_plat_map.append(['7', '2.1'])
                    api_plat_map.append(['8', '2.2'])
                    api_plat_map.append(['9', '2.3'])
                    api_plat_map.append(['10', '2.3.3'])
                    api_plat_map.append(['11', '3.0'])
                    api_plat_map.append(['12', '3.1'])
                    api_plat_map.append(['13', '3.2'])
                    api_plat_map.append(['14', '4.0'])
                    api_plat_map.append(['15', '4.0.3'])
                    api_plat_map.append(['16', '4.1'])
                    api_plat_map.append(['17', '4.2'])
                    api_plat_map.append(['18', '4.3'])  # Webviews have critical vuln, no more patches from Google
                    api_plat_map.append(['19', '4.4'])
                    api_plat_map.append(['20', '4.4'])  # This is actually 4.4W, a wearable only build, I'm assuming it is the same as 4.4 for our purposes
                    api_plat_map.append(['21', '5.0'])
                    api_plat_map.append(['22', '5.1'])  # This is the latest version, we'll assume this for newer, until update
                    # TODO - double check this, adding 5.1 may have broken it
                    for a in api_plat_map:
                        if StrictVersion(str(plat_version[0])) >= StrictVersion(str(a[1])):
                            common.minSdkVersion = a[0]
                    logger.info(common.config.get('qarkhelper', 'MIN_SDK_VERSION') + str(common.minSdkVersion))
                    manual = raw_input(common.config.get('qarkhelper', 'SDK_VALUE_MANUAL'))
                else:
                    print common.config.get('qarkhelper', 'CANT_DET_PLAY')
                # BUG - not processing the cases of wanting to enter it manually, if the retrieval of the play version is broken
            except HTTPError, e:
                print str(e)
                logger.error(common.config.get('qarkhelper', 'MIN_SDK_PLAY_STORE_FAILED'))
        elif (determineSdk.lower() == 'm' or common.minSdkVersion == 0):
            # does not actually become 1, just needs a value, since it wasn't found, so we assume worst case
            print common.term.cyan + common.term.bold + str(common.config.get('qarkhelper', 'NO_MIN_SDK')).decode('string-escape').format(t=common.term)
            enterSdk = raw_input(common.config.get('qarkhelper', 'PROMPT_MIN_SDK'))
            if enterSdk.lower() == 'y':
                sdkinput = 0
                while True:
                    sdkinput = int(raw_input(common.config.get('qarkhelper', 'PROMPT_VER') + common.config.get('qarkhelper', 'MAX_API_VERSION') + common.config.get('qarkhelper', 'PROMPT_VER2')))
                    if 0 < int(sdkinput) <= int(common.config.get('qarkhelper', 'MAX_API_VERSION')):
                        common.minSdkVersion = int(sdkinput)
                        break
            else:
                common.minSdkVersion = 7
def parse_display_url(tag):
    bs = BeautifulSoup(tag)
    display_url = bs.find('cite')
    dis_url = p.sub('', str(display_url))
    return str(dis_url)
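The snippets above (parse_organic_contents, parse_display_url) rely on module-level names `p` and `ul` that are not defined in this excerpt. A minimal sketch of plausible definitions, purely an assumption inferred from how they are used (a tag-stripping regex and a urllib alias):

import re
import urllib as ul  # assumed alias; the excerpt only calls ul.unquote(...)

# Assumed helper: strips HTML tags from the str() form of a soup node.
p = re.compile(r'<.*?>')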