Example 1
def parse_tcx3(infiles):
    schema = { 'geometry': 'LineString', 'properties': {} }
    with collection(
        "lines.shp", "w", "ESRI Shapefile", schema) as output:
        for infile in infiles:
            print "processing %s" % infile
            soup = bss(open(infile,'r'))

            ls = []
            # Activity
            for activity in soup.findAll('activity'):

                # Lap
                for lap in activity.findAll('lap'):
                    # Track
                    for track in lap.findAll('track'):

                        # Trackpoint
                        for point in track.findAll('trackpoint'):
                            try:
                                coords = [float(x) for x in
                                         [point.position.longitudedegrees.string,
                                          point.position.latitudedegrees.string]]
                                ls.append(coords)
                            except: coords = None
            if len(ls) > 2:
                output.write({
                    'properties': {
                    },
                    'geometry': mapping(LineString(ls))
                })
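
A note on Example 1: `collection` is not part of BeautifulSoup; it appears to come from Fiona, with `LineString` and `mapping` from Shapely. Under that assumption, the module-level imports this example relies on would look roughly like this:

# Probable imports for the example above (assumed, not shown in the original)
from fiona import collection
from shapely.geometry import LineString, mapping
from BeautifulSoup import BeautifulStoneSoup as bss

# parse_tcx3() expects an iterable of TCX file paths, e.g.:
#   import glob
#   parse_tcx3(glob.glob('*.tcx'))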
Example 2
def play():
    import os, requests
    from BeautifulSoup import BeautifulStoneSoup as bss
    import gluon.contrib.simplejson as sj
    import random

    session.forget()
    lastfmuser = request.vars.lastfmuser
    # print 'lastfmuser', lastfmuser
    api_key = os.environ['LASTFM_API_KEY']
    
    url = 'http://ws.audioscrobbler.com/2.0/?method=user.gettopartists&user=%s&api_key=%s&limit=20' % (lastfmuser, api_key)
    artists = requests.get(url).content
    artists = bss(artists)
    
    artist_names = artists.findAll('name')
    random.shuffle(artist_names, random.random)
    # print artist_names
    
    yt_url = 'http://gdata.youtube.com/feeds/api/videos?q=%s&max-results=5&category=Music&v=2&alt=json'
    yt_list = []
    
    for artist in artist_names:
        a = artist.text.replace(' ', '+')
        # print 'searching: ' + a
        video_json = requests.get(yt_url % a).content
        video = sj.loads(video_json)
        # randomly pick one
        index = random.randint(0,4)
        title = video['feed']['entry'][index]['media$group']['media$title']
        yt_video_id = video['feed']['entry'][index]['media$group']['yt$videoid']
        yt_list.append({'title':title['$t'], 'yt_id':yt_video_id['$t']})
        
    return response.json({'yt_list':yt_list})
Example 4
def getSectionData(securl):
    usock = urllib2.urlopen(securl)
    sections = usock.read()
    usock.close()
    soup = bss(sections, selfClosingTags=['s'])

    return soup
Example 5
def getSectionText(rev_sec):
    usock = urllib2.urlopen(difftext_url % (rev_sec[0], rev_sec[1], rev_sec[1], rev_sec[2]))
    text = usock.read()
    usock.close()
    soup = bss(text)
    msg = soup.api.query.pages.page.revisions.rev
    content = msg.string
    content = content.strip()

    return content
Example 6
def token_counter(text):
    soup = bss(text)
    plain_text = soup.text
    tokens = defaultdict(int)
    for tok in token_regex.finditer(plain_text):
        token = tok.group(1)
        if token:
            token = token.strip()
        tokens[token] += 1
    return tokens
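
Example 6 relies on two module-level names it does not show: `token_regex` and `defaultdict`. A minimal sketch of compatible definitions (the actual regex is not known, so the pattern below is a guess):

import re
from collections import defaultdict
from BeautifulSoup import BeautifulStoneSoup as bss

token_regex = re.compile(r'(\w+)', re.UNICODE)  # hypothetical; group 1 must capture the token

# With the pattern above, token_counter('<p>one two two</p>') yields {'one': 1, 'two': 2}.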
Example 7
def make_trial_soup():
    xml = ''.join(f_data)
    soup  = bs(xml)
    ssoup = bss(xml)

    trial_soup = [] #each item of list is BeautifulSoup
    for i in ssoup('trial'):
        j = bs(str(i))
        trial_soup.append(j)
    return trial_soup
Example 8
def getSectionNumbers(item):
    page_secs = []
    usock = urllib2.urlopen(diffsec_url % (item[1], item[2]))
    sections = usock.read()
    usock.close()
    soup = bss(sections, selfClosingTags=["s"])
    for x in soup.findAll("s"):
        try:
            page_secs.append(int(x["index"]))
        except ValueError:
            continue
    return page_secs
Example 9
def parse_tcx3(infile):
    from BeautifulSoup import BeautifulStoneSoup as bss
    soup = bss(open(infile,'r'))
    
    ###### Activity
    for activity in soup.findAll('activity'):
        sport = activity['sport']
        activityid = activity.id.string

        ###### Lap
        for lap in activity.findAll('lap'):
            lapid = lap['starttime']
            time = time_code(lapid)
            totaltime = float(lap.totaltimeseconds.string)
            distance = float(lap.distancemeters.string)
            maxspeed = float(lap.maximumspeed.string)
            calories = float(lap.calories.string)
            intensity = lap.intensity.string
            cadence = float(lap.cadence.string)
            trigger = lap.triggermethod.string

            try: avghr = float(lap.averageheartratebpm.value.string)
            except: avghr = None

            try: maxhr = float(lap.maximumheartratebpm.value.string)
            except: maxhr = None
            
            ##### Track
            for track in lap.findAll('track'):

                ##### Trackpoint
                for point in track.findAll('trackpoint'):
                    pointid = point.time.string
                    print pointid
                    time = time_code(pointid)
                    cumdist = float(point.distancemeters.string)

                    try:
                        coords = [float(x) for x in 
                                 [point.position.longitudedegrees.string, 
                                  point.position.latitudedegrees.string]]
                    except: coords = None

                    try: cadence = int(point.cadence.string)
                    except: cadence = None

                    try: alt = float(point.altitudemeters.string)
                    except: alt = None

                    try: hr = int(point.heartratebpm.value.string)
                    except: hr = None
Example 10
def get_problem(number):
    problem = get('http://projecteuler.net/problem=' + number, headers={'user-agent': 'euler-init'})
    try:
        if problem.status_code == 404:
            raise Exception('Problem {0} does not exist'.format(str(number)))
        elif problem.status_code != 200:
            raise Exception('Error - ' + repr(problem.status_code))
        html = bs(problem.text)
        question = unicode(bss(html.find('h2').text, convertEntities=bss.ALL_ENTITIES))
        statement= html.find('div', {'class': 'problem_content'}).text
        return question, statement
    except Exception, e:
        print e
        return None
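
In Example 10, `get` is presumably `requests.get`, and `bs`/`bss` are BeautifulSoup/BeautifulStoneSoup. Since `number` is concatenated straight into the URL, it has to be passed as a string. A usage sketch under those assumptions:

result = get_problem('1')        # the problem number goes in as a string
if result is not None:
    question, statement = result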
Example 11
def cleanup(infile, outfile):
    # main routine.  takes a file handle for input and output; called at the bottom of the file.
    text = infile.read()
    # custom regexes for things that BeautifulSoup can't handle go here.  Keep to a minimum.  Below examples are for the Encyc.
    # text = re.sub(r"<\?>",r"<gap reason='omitted' unit='character' />",text)
    # text = re.sub(r"<\->",r"<gap reason='omitted' unit='bracket' />",text)
    # text = re.sub(r"<MVO_PIM=\"(.*?)\">",r'<figure><graphic url="\g<1>"></graphic></figure>',text)
    # text = re.sub(r"<omit=(.*?)>",r"<gap reason='omitted' unit='\g<1>' />",text)
    print(len(text))
    soup = bss(text, selfClosingTags=self_closing)
    for tag in soup.findAll():
        if tag.name in fix_case:
            tag.name = fix_case[tag.name]
    print(soup, file=outfile)
    outfile.close()
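
`self_closing` and `fix_case` are module-level lookups that Example 11 assumes (Example 24 below shows one way to build them from a tag census), and the `print(..., file=outfile)` call means the module needs the print function on Python 2. A minimal call sketch with hypothetical file names:

from __future__ import print_function        # at the top of the module, if running on Python 2

with open('source.xml') as infile, open('clean.xml', 'w') as outfile:   # hypothetical paths
    cleanup(infile, outfile)                  # note that cleanup() also closes outfile itself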
Example 13
 def fetchLocalResults(self, url):
     result = []
     xml = urlfetch.fetch(url).content
     soup = bss(xml)
     stations = soup.findAll('station')
     for station in stations:
         name = station.find('name')
         band = station.find('band')
         freq = station.find('frequency')
         if name and band and freq:
             res = []
             res.append(str(name.contents[0]))
             channel = band.contents[0] + ' ' + freq.contents[0]
             res.append(str(channel))
             result.append(res)
     return result
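
`urlfetch` in Example 13 (and again in Example 15) looks like App Engine's URL Fetch service; if so, the only import missing from the excerpt is:

from google.appengine.api import urlfetch    # assumed; urlfetch.fetch(url).content matches its API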
Example 14
def page(req, path):
    if splitext(path)[1] == '.html':
        soup = bss(open(join(settings.SPHINX_PROJECT, path)).read())
        head = soup.find('head')
        first_script = Tag(soup, 'script')
        first_script['src'] = "../_static/simplecomment.js" 
        first_script['type'] = "text/javascript"
        second_script = Tag(soup, 'script')
        second_script['src'] = "../_static/jquery.form.js" 
        second_script['type'] = "text/javascript"
        head.insert(-1, first_script)
        head.insert(-1, second_script)
        counter = 0
        page_identity = path.split('.')[0].replace('/', '_')
        for p in soup.findAll('p'):
            p['id'] = '%s_%s' %(page_identity, counter)
            counter += 1
        return HttpResponse(str(soup))
    else:
        return HttpResponse(open(join(settings.SPHINX_PROJECT, path)).read())
Example 15
 def fetchCategoryResults(self, url):
     result = []
     xml = urlfetch.fetch(url).content
     soup = bss(xml)
     stories = soup.findAll('story')
     for story in stories:            
         story_id = story['id']
         titles = story.findAll('title')
         if len(titles) > 0:
             title = titles[0].contents[0]
             mp3_tag = story.findAll('mp3')
             if len(mp3_tag) > 0:
                 mp3 = mp3_tag[0].contents[0]
                 text = str(urlfetch.fetch(mp3).content)
                 text = text.replace('\n', '')
                 index = text.find('.mp3')
                 text = text[:index + 4]
                 title_valid = title.replace(' ', '+').replace(':', '-')
                 res = []
                 res.append(str(title + ''))
                 res.append(text)
                 res.append(title_valid)
                 result.append(res)
     return result
Example 16
def parse_tcx3(infile, lyr):
    soup = bss(open(infile,'r'))

    # Activity
    for activity in soup.findAll('activity'):
        sport = activity['sport']
        activityid = activity.id.string

        # Lap
        for lap in activity.findAll('lap'):
            # Track
            for track in lap.findAll('track'):

                # Trackpoint
                for point in track.findAll('trackpoint'):
                    feature = {}
                    pointid = point.time.string
                    feature['time'] = time_code(pointid)
                    feature['activityid'] = time_code(activityid)
                    try:
                        feature['speed'] = float(point.find('ns3:speed').string)
                    except: feature['speed'] = 0

                    try:
                        feature['coords'] = [float(x) for x in
                                 [point.position.longitudedegrees.string,
                                  point.position.latitudedegrees.string]]
                    except: feature['coords'] = None

                    try: feature['alt'] = float(point.altitudemeters.string)
                    except: feature['alt'] = 0

                    try: feature['bpm'] = int(point.heartratebpm.value.string)
                    except: feature['bpm'] = 0

                    add_feature(lyr, feature)
Example 17
#!/usr/bin/env python

import sys, urllib, re, string
from BeautifulSoup import BeautifulSoup
from BeautifulSoup import BeautifulStoneSoup as bss
from subprocess import *

imgurl = sys.argv[1]
print "Getting page title.."
soup = BeautifulSoup(urllib.urlopen(imgurl))

title = str(soup.title.string).strip()
raw_query = unicode(bss(title, convertEntities=bss.ALL_ENTITIES))

print "Found this:\n", raw_query

newurl = "http://www.reddit.com/search?q=" + raw_query

print "Searching on reddit.."
new_soup = BeautifulSoup(urllib.urlopen(newurl))

results = new_soup.findAll('a', {"class" : "comments"})
if len(results) == 0:
    print 'Nothing found, sorry.' 
    sys.exit(1)
print "Found top result:\n", results[0]
go_here = results[0]['href']
print go_here

commands = [
        'open',
Example 18
def format(text):
    return bss(compressWhiteSpace.sub(' ', stripHtmlTags.sub('', text)), convertEntities=bss.ALL_ENTITIES)
<br/>
</noinclude>
%s
'''

wiki = wikitools.Wiki(config_reader.get("apiurl"))
wiki.login(config_reader.get("username"), config_reader.get("password"))

# get the number of new profiles on the Guest_left page

securl = u'http://en.wikipedia.org/w/api.php?action=parse&page=Wikipedia%3ATeahouse%2FGuests%2FRight_column&prop=sections&format=xml'

usock = urllib2.urlopen(securl)
sections = usock.read()
usock.close()
soup = bss(sections, selfClosingTags = ['s'])
seccount = soup.findAll('s')

if len(seccount) < 11:
	pass
else:
	# if there are more than 10 intros, get the text of the intros being archived
	i = len(seccount) #just changed this to be the seccount number, and changed the while loop to count down
	j = i - 10
	output = []
	returns = []

	while i > 1:
		texturl1 = u'http://en.wikipedia.org/w/index.php?title=Wikipedia%%3ATeahouse%%2FGuests%%2FRight_column&action=raw&section=%d' % i
		usock = urllib2.urlopen(texturl1)
		sections1 = usock.read()
Example 20
        for commentmeta in item.findAll('wp:commentmeta'):
            commentmeta.extract()
        item.find('wp:is_sticky').extract()
        item.find('wp:ping_status').extract()
        item.find('wp:comment_status').extract()
        # item.find('wp:post_date').extract()

def make_post(item):
    title = item.title.text
    desc = item.description
    post_name = desc.find('wp:post_name').text
    content = unicode(desc.find('content:encoded').text)
    status = desc.find('wp:status').text
    if status == 'draft':
        post = Draft(title, post_name, content)
    elif status == 'publish':
        date_gmt = desc.find('wp:post_date_gmt').text
        post = Publish(title, post_name, content, date_gmt)
    return post


if __name__ == "__main__":    
    data = bss(file(XML_FILE).read(), 
               convertEntities=bss.ALL_ENTITIES)

    items = data.contents[6].findAll('item')
    cleanup_items(items)
    posts = map(make_post, items)
    
    map(lambda post: post.write_to_file(BLOG_DIR), posts)
Example 21
def soupify(url):
    return bss(urlopen(url))
Example 22
def bs(html):
    return bss(html)
Example 24
            total = census
    
        #Annoyingly, BeautifulSoup requires you to declare ALL self-closing tags yourself; it will badly mangle your text if you miss one, so get this right.
        self_closing = []

        #BeautifulSoup lowercases all element names; to get things closer to standard TEI, I've included a list here which I use to restore them after parsing
        fix_case = {}
    
        for tag in list(census.tags.keys()):    
            fix_case[tag.lower()] = tag
            if census[tag]["empty"] != 0:
                self_closing.append(tag)
        
        ## Add any custom regex here
        
        soup = bss(text,selfClosingTags=self_closing)
        for tag in soup.findAll():
            if tag.name in fix_case:
                tag.name = fix_case[tag.name]
        
        file_contents = str(soup)
        file_contents = convert_remaining_entities(file_contents, quiet)
        
        try:
            parser = etree.XMLParser(huge_tree=True, remove_blank_text=True, strip_cdata=False)
            tree = etree.fromstring(file_contents, parser=parser)
            for el in tree.iter():
                ## Tags are defined as el.tag, so to change tag, you do: el.tag = "some_other_tag"
                ## Attributes are contained in el.attrib where each attribute is a key. To change the type attribute you do: el.attrib['type'] = "some_other_type"
                if el.tag in xml_tag_mapping: ## Check if the tag should be replaced according to the xml mapping dict
                    el.tag = xml_tag_mapping[el.tag]
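
The comments at the top of Example 24 describe two real BeautifulSoup 3 behaviours: a self-closing tag you fail to declare swallows the markup that follows it, and all element names come back lowercased. A small standalone illustration with made-up tag names:

from BeautifulSoup import BeautifulStoneSoup as bss

sample = '<TEI><Pb n="1"/><P>some text</P></TEI>'
soup = bss(sample, selfClosingTags=['pb'])        # without 'pb' here, <Pb/> is treated as an open tag
fix_case = {'tei': 'TEI', 'pb': 'Pb', 'p': 'P'}   # element names are lowercased by the parser; map them back
for tag in soup.findAll():
    if tag.name in fix_case:
        tag.name = fix_case[tag.name]
restored = str(soup)                              # serialized markup with the original casing restored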
Example 25
<br/>
</noinclude>
%s
'''

wiki = wikitools.Wiki(settings.apiurl)
wiki.login(settings.username, settings.password)

# get the number of new profiles on the Guest_left page

securl = u'http://en.wikipedia.org/w/api.php?action=parse&page=Wikipedia%3ATeahouse%2FGuests%2FRight_column&prop=sections&format=xml'

usock = urllib2.urlopen(securl)
sections = usock.read()
usock.close()
soup = bss(sections, selfClosingTags=['s'])
seccount = soup.findAll('s')

if len(seccount) < 11:
    pass
else:
    # if there are more than 10 intros, get the text of the intros being archived
    i = len(
        seccount
    )  #just changed this to be the seccount number, and changed the while loop to count down
    j = i - 10
    output = []
    returns = []

    while i > 1:
        texturl1 = u'http://en.wikipedia.org/w/index.php?title=Wikipedia%%3ATeahouse%%2FGuests%%2FRight_column&action=raw&section=%d' % i
Example 27
def bs(html):
    return bss(html, "html.parser")