def parse_tcx3(infiles):
    schema = {'geometry': 'LineString', 'properties': {}}
    with collection("lines.shp", "w", "ESRI Shapefile", schema) as output:
        for infile in infiles:
            print "processing %s" % infile
            soup = bss(open(infile, 'r'))
            ls = []
            # Activity
            for activity in soup.findAll('activity'):
                # Lap
                for lap in activity.findAll('lap'):
                    # Track
                    for track in lap.findAll('track'):
                        # Trackpoint
                        for point in track.findAll('trackpoint'):
                            try:
                                coords = [float(x) for x in [point.position.longitudedegrees.string,
                                                             point.position.latitudedegrees.string]]
                                ls.append(coords)
                            except:
                                coords = None
            if len(ls) > 2:
                output.write({'properties': {},
                              'geometry': mapping(LineString(ls))})
def play():
    import os, requests
    from BeautifulSoup import BeautifulStoneSoup as bss
    import gluon.contrib.simplejson as sj
    import random

    session.forget()
    lastfmuser = request.vars.lastfmuser
    # print 'lastfmuser', lastfmuser
    api_key = os.environ['LASTFM_API_KEY']
    url = 'http://ws.audioscrobbler.com/2.0/?method=user.gettopartists&user=%s&api_key=%s&limit=20' % (lastfmuser, api_key)
    artists = requests.get(url).content
    artists = bss(artists)
    artist_names = artists.findAll('name')
    random.shuffle(artist_names, random.random)
    # print artist_names
    yt_url = 'http://gdata.youtube.com/feeds/api/videos?q=%s&max-results=5&category=Music&v=2&alt=json'
    yt_list = []
    for artist in artist_names:
        a = artist.text.replace(' ', '+')
        # print 'searching: ' + a
        video_json = requests.get(yt_url % a).content
        video = sj.loads(video_json)
        # randomly pick one
        index = random.randint(0, 4)
        title = video['feed']['entry'][index]['media$group']['media$title']
        yt_video_id = video['feed']['entry'][index]['media$group']['yt$videoid']
        yt_list.append({'title': title['$t'], 'yt_id': yt_video_id['$t']})
    return response.json({'yt_list': yt_list})
def getSectionData(securl):
    usock = urllib2.urlopen(securl)
    sections = usock.read()
    usock.close()
    soup = bss(sections, selfClosingTags=['s'])
    return soup
def getSectionText(rev_sec):
    usock = urllib2.urlopen(difftext_url % (rev_sec[0], rev_sec[1], rev_sec[1], rev_sec[2]))
    text = usock.read()
    usock.close()
    soup = bss(text)
    msg = soup.api.query.pages.page.revisions.rev
    content = msg.string
    content = content.strip()
    return content
def token_counter(text):
    soup = bss(text)
    plain_text = soup.text
    tokens = defaultdict(int)
    for tok in token_regex.finditer(plain_text):
        token = tok.group(1)
        if token:
            token = token.strip()
            tokens[token] += 1
    return tokens
def make_trial_soup():
    xml = ''.join(f_data)
    soup = bs(xml)
    ssoup = bss(xml)
    trial_soup = []  # each item of list is BeautifulSoup
    for i in ssoup('trial'):
        j = bs(str(i))
        trial_soup.append(j)
    return trial_soup
def getSectionNumbers(item):
    page_secs = []
    usock = urllib2.urlopen(diffsec_url % (item[1], item[2]))
    sections = usock.read()
    usock.close()
    soup = bss(sections, selfClosingTags=["s"])
    for x in soup.findAll("s"):
        try:
            page_secs.append(int(x["index"]))
        except ValueError:
            continue
    return page_secs
def parse_tcx3(infile):
    from BeautifulSoup import BeautifulStoneSoup as bss
    soup = bss(open(infile, 'r'))
    ###### Activity
    for activity in soup.findAll('activity'):
        sport = activity['sport']
        activityid = activity.id.string
        ###### Lap
        for lap in activity.findAll('lap'):
            lapid = lap['starttime']
            time = time_code(lapid)
            totaltime = float(lap.totaltimeseconds.string)
            distance = float(lap.distancemeters.string)
            maxspeed = float(lap.maximumspeed.string)
            calories = float(lap.calories.string)
            intensity = lap.intensity.string
            cadence = float(lap.cadence.string)
            trigger = lap.triggermethod.string
            try:
                avghr = float(lap.averageheartratebpm.value.string)
            except:
                avghr = None
            try:
                maxhr = float(lap.maximumheartratebpm.value.string)
            except:
                maxhr = None
            ##### Track
            for track in lap.findAll('track'):
                ##### Trackpoint
                for point in track.findAll('trackpoint'):
                    pointid = point.time.string
                    print pointid
                    time = time_code(pointid)
                    cumdist = float(point.distancemeters.string)
                    try:
                        coords = [float(x) for x in [point.position.longitudedegrees.string,
                                                     point.position.latitudedegrees.string]]
                    except:
                        coords = None
                    try:
                        cadence = int(point.cadence.string)
                    except:
                        cadence = None
                    try:
                        alt = float(point.altitudemeters.string)
                    except:
                        alt = None
                    try:
                        hr = int(point.heartratebpm.value.string)
                    except:
                        hr = None
def get_problem(number):
    problem = get('http://projecteuler.net/problem=' + number,
                  headers={'user-agent': 'euler-init'})
    try:
        if problem.status_code == 404:
            raise Exception('Problem {0} does not exist'.format(str(number)))
        elif problem.status_code != 200:
            raise Exception('Error - ' + repr(problem.status_code))
        html = bs(problem.text)
        question = unicode(bss(html.find('h2').text, convertEntities=bss.ALL_ENTITIES))
        statement = html.find('div', {'class': 'problem_content'}).text
        return question, statement
    except Exception, e:
        print e
        return None
def cleanup(infile, outfile):
    # main routine. takes a file handle for input and output; called at the bottom of the file.
    text = infile.read()
    # custom regexes for things that BeautifulSoup can't handle go here. Keep to a minimum.
    # Below examples are for the Encyc.
    # text = re.sub(r"<\?>", r"<gap reason='omitted' unit='character' />", text)
    # text = re.sub(r"<\->", r"<gap reason='omitted' unit='bracket' />", text)
    # text = re.sub(r"<MVO_PIM=\"(.*?)\">", r'<figure><graphic url="\g<1>"></graphic></figure>', text)
    # text = re.sub(r"<omit=(.*?)>", r"<gap reason='omitted' unit='\g<1>' />", text)
    print(len(text))
    soup = bss(text, selfClosingTags=self_closing)
    for tag in soup.findAll():
        if tag.name in fix_case:
            tag.name = fix_case[tag.name]
    print(soup, file=outfile)
    outfile.close()
def fetchLocalResults(self, url):
    result = []
    xml = urlfetch.fetch(url).content
    soup = bss(xml)
    stations = soup.findAll('station')
    for station in stations:
        name = station.find('name')
        band = station.find('band')
        freq = station.find('frequency')
        if name and band and freq:
            res = []
            res.append(str(name.contents[0]))
            channel = band.contents[0] + ' ' + freq.contents[0]
            res.append(str(channel))
            result.append(res)
    return result
def page(req, path):
    if splitext(path)[1] == '.html':
        soup = bss(open(join(settings.SPHINX_PROJECT, path)).read())
        head = soup.find('head')
        first_script = Tag(soup, 'script')
        first_script['src'] = "../_static/simplecomment.js"
        first_script['type'] = "text/javascript"
        second_script = Tag(soup, 'script')
        second_script['src'] = "../_static/jquery.form.js"
        second_script['type'] = "text/javascript"
        head.insert(-1, first_script)
        head.insert(-1, second_script)
        counter = 0
        page_identity = path.split('.')[0].replace('/', '_')
        for p in soup.findAll('p'):
            p['id'] = '%s_%s' % (page_identity, counter)
            counter += 1
        return HttpResponse(str(soup))
    else:
        return HttpResponse(open(join(settings.SPHINX_PROJECT, path)).read())
def fetchCategoryResults(self, url):
    result = []
    xml = urlfetch.fetch(url).content
    soup = bss(xml)
    stories = soup.findAll('story')
    for story in stories:
        story_id = story['id']
        titles = story.findAll('title')
        if len(titles) > 0:
            title = titles[0].contents[0]
            mp3_tag = story.findAll('mp3')
            if len(mp3_tag) > 0:
                mp3 = mp3_tag[0].contents[0]
                text = str(urlfetch.fetch(mp3).content)
                text = text.replace('\n', '')
                index = text.find('.mp3')
                text = text[:index + 4]
                title_valid = title.replace(' ', '+').replace(':', '-')
                res = []
                res.append(str(title + ''))
                res.append(text)
                res.append(title_valid)
                result.append(res)
    return result
def parse_tcx3(infile, lyr):
    soup = bss(open(infile, 'r'))
    # Activity
    for activity in soup.findAll('activity'):
        sport = activity['sport']
        activityid = activity.id.string
        # Lap
        for lap in activity.findAll('lap'):
            # Track
            for track in lap.findAll('track'):
                # Trackpoint
                for point in track.findAll('trackpoint'):
                    feature = {}
                    pointid = point.time.string
                    feature['time'] = time_code(pointid)
                    feature['activityid'] = time_code(activityid)
                    try:
                        feature['speed'] = float(point.find('ns3:speed').string)
                    except:
                        feature['speed'] = 0
                    try:
                        feature['coords'] = [float(x) for x in [point.position.longitudedegrees.string,
                                                                point.position.latitudedegrees.string]]
                    except:
                        feature['coords'] = None
                    try:
                        feature['alt'] = float(point.altitudemeters.string)
                    except:
                        feature['alt'] = 0
                    try:
                        feature['bpm'] = int(point.heartratebpm.value.string)
                    except:
                        feature['bpm'] = 0
                    add_feature(lyr, feature)
#!/usr/bin/env python
import sys, urllib, re, string
from BeautifulSoup import BeautifulSoup
from BeautifulSoup import BeautifulStoneSoup as bss
from subprocess import *

imgurl = sys.argv[1]

print "Getting page title.."
soup = BeautifulSoup(urllib.urlopen(imgurl))
title = str(soup.title.string).strip()
raw_query = unicode(bss(title, convertEntities=bss.ALL_ENTITIES))
print "Found this:\n", raw_query

newurl = "http://www.reddit.com/search?q=" + raw_query
print "Searching on reddit.."
new_soup = BeautifulSoup(urllib.urlopen(newurl))
results = new_soup.findAll('a', {"class": "comments"})
if len(results) == 0:
    print 'Nothing found, sorry.'
    sys.exit(1)
print "Found top result:\n", results[0]
go_here = results[0]['href']
print go_here
commands = [
    'open',
def format(text):
    return bss(compressWhiteSpace.sub(' ', stripHtmlTags.sub('', text)),
               convertEntities=bss.ALL_ENTITIES)
<br/>
</noinclude>
%s
'''

wiki = wikitools.Wiki(config_reader.get("apiurl"))
wiki.login(config_reader.get("username"), config_reader.get("password"))

# get the number of new profiles on the Guest_left page
securl = u'http://en.wikipedia.org/w/api.php?action=parse&page=Wikipedia%3ATeahouse%2FGuests%2FRight_column&prop=sections&format=xml'
usock = urllib2.urlopen(securl)
sections = usock.read()
usock.close()
soup = bss(sections, selfClosingTags=['s'])
seccount = soup.findAll('s')
if len(seccount) < 11:
    pass
else:
    # if there are more than 10 intros, get the text of the intros being archived
    i = len(seccount)  # just changed this to be the seccount number, and changed the while loop to count down
    j = i - 10
    output = []
    returns = []
    while i > 1:
        texturl1 = u'http://en.wikipedia.org/w/index.php?title=Wikipedia%%3ATeahouse%%2FGuests%%2FRight_column&action=raw&section=%d' % i
        usock = urllib2.urlopen(texturl1)
        sections1 = usock.read()
for commentmeta in item.findAll('wp:commentmeta'):
    commentmeta.extract()
item.find('wp:is_sticky').extract()
item.find('wp:ping_status').extract()
item.find('wp:comment_status').extract()
# item.find('wp:post_date').extract()


def make_post(item):
    title = item.title.text
    desc = item.description
    post_name = desc.find('wp:post_name').text
    content = unicode(desc.find('content:encoded').text)
    status = desc.find('wp:status').text
    if status == 'draft':
        post = Draft(title, post_name, content)
    elif status == 'publish':
        date_gmt = desc.find('wp:post_date_gmt').text
        post = Publish(title, post_name, content, date_gmt)
    return post


if __name__ == "__main__":
    data = bss(file(XML_FILE).read(), convertEntities=bss.ALL_ENTITIES)
    items = data.contents[6].findAll('item')
    cleanup_items(items)
    posts = map(make_post, items)
    map(lambda post: post.write_to_file(BLOG_DIR), posts)
def soupify(url):
    return bss(urlopen(url))
def bs(html):
    return bss(html)
total = census

# Annoyingly, BeautifulSoup requires you to declare ALL self-closing tags yourself;
# it will badly mangle your text if you miss one, so get this right.
self_closing = []

# BeautifulSoup lowercases all element names; to get things closer to standard TEI,
# I've included a list here which I use to restore them after parsing
fix_case = {}
for tag in list(census.tags.keys()):
    fix_case[tag.lower()] = tag
    if census[tag]["empty"] != 0:
        self_closing.append(tag)

## Add any custom regex here

soup = bss(text, selfClosingTags=self_closing)
for tag in soup.findAll():
    if tag.name in fix_case:
        tag.name = fix_case[tag.name]
file_contents = str(soup)
file_contents = convert_remaining_entities(file_contents, quiet)

try:
    parser = etree.XMLParser(huge_tree=True, remove_blank_text=True, strip_cdata=False)
    tree = etree.fromstring(file_contents, parser=parser)
    for el in tree.iter():
        ## Tags are defined as el.tag, so to change tag, you do: el.tag = "some_other_tag"
        ## Attributes are contained in el.attrib where each attribute is a key.
        ## To change the type attribute you do: el.attrib['type'] = "some_other_type"
        if el.tag in xml_tag_mapping:
            ## Check if the tag should be replaced according to the xml mapping dict
            el.tag = xml_tag_mapping[el.tag]
<br/>
</noinclude>
%s
'''

wiki = wikitools.Wiki(settings.apiurl)
wiki.login(settings.username, settings.password)

# get the number of new profiles on the Guest_left page
securl = u'http://en.wikipedia.org/w/api.php?action=parse&page=Wikipedia%3ATeahouse%2FGuests%2FRight_column&prop=sections&format=xml'
usock = urllib2.urlopen(securl)
sections = usock.read()
usock.close()
soup = bss(sections, selfClosingTags=['s'])
seccount = soup.findAll('s')
if len(seccount) < 11:
    pass
else:
    # if there are more than 10 intros, get the text of the intros being archived
    i = len(seccount)  # just changed this to be the seccount number, and changed the while loop to count down
    j = i - 10
    output = []
    returns = []
    while i > 1:
        texturl1 = u'http://en.wikipedia.org/w/index.php?title=Wikipedia%%3ATeahouse%%2FGuests%%2FRight_column&action=raw&section=%d' % i
def bs(html):
    return bss(html, "html.parser")
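Most of the snippets above target the old BeautifulSoup 3 API, where BeautifulStoneSoup was a separate XML-oriented class; in bs4 it survives only as a deprecated alias of BeautifulSoup. A minimal migration sketch, assuming bs4 is installed (plus lxml for the "xml" parser); the soupify_xml/soupify_html helper names are illustrative, not taken from the code above:

# Rough bs4-era equivalents of the bss() helpers above (illustrative names,
# not from the original code). Assumes bs4 is installed and lxml is available
# for the "xml" parser; in bs4, BeautifulStoneSoup is only a deprecated alias.
from bs4 import BeautifulSoup

def soupify_xml(markup):
    # XML parsing, roughly what BeautifulStoneSoup did in BeautifulSoup 3
    return BeautifulSoup(markup, "xml")

def soupify_html(markup):
    # HTML parsing with the stdlib parser, as in the helper directly above
    return BeautifulSoup(markup, "html.parser")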