def get_video_providers(url):
    """Scrape the list of video providers advertised on *url*.

    Returns a list of dicts, each with keys 'provider_name' and
    'provider_url'.
    """
    page_soup = BeautifulSoup(fetchUrl(url))
    providers = []
    for video_item in page_soup.findSelect('div.generic-video-item'):
        item_soup = BeautifulSoup(str(video_item.contents))
        thumbs = item_soup.findSelect('div.thumb')
        anchor = thumbs[0].a
        if anchor:
            # Linked thumbnail: both name and href live on the anchor.
            name = anchor.center.span.string
            href = anchor['href']
        else:
            # No link: fall back to the page URL and the second <span> label.
            href = url
            spans = BeautifulSoup(str(thumbs[0].contents)).findAll('span')
            name = spans[1].string
        providers.append({'provider_name': name, 'provider_url': href})
    return providers
def test_monkeypatch_implicit(self):
    """findSelect appears after monkeypatch() and disappears after unmonkeypatch()."""
    soup = BeautifulSoup(HTML)
    # Before patching, calling findSelect on a soup must fail.
    self.assertRaises(TypeError, soup.findSelect, '*')
    monkeypatch()
    # After patching, selectors work on the very same soup instance.
    self.assert_(soup.findSelect('*'))
    self.assertSelectMultipleExplicit(
        soup,
        ('link', ['l1']),
        ('div#main', ['main']),
        ('div div', ['inner']),
    )
    unmonkeypatch()
    # Unpatching restores the original failure mode.
    self.assertRaises(TypeError, soup.findSelect, '*')
def _analyze_response(self, response):
    """Evaluate every configured condition against *response*.

    Returns True only when all conditions hold; stops at the first
    failing one.
    """
    soup = BeautifulSoup(response)
    for cond in self._conditions:
        self.logger.debug('Condition: {0}'.format(cond))
        pattern = cond.get('pattern')
        # The sentinel selector "_text" means "search the document text";
        # anything else is treated as a CSS selector.
        if cond.get('selector') == "_text":
            match = soup.find(text=pattern)
        else:
            match = soup.findSelect(cond.get('selector'))
        # Each condition states whether a match is expected ('found').
        if cond.get('found') != bool(match):
            return False
    return True
def download_post(url): response = requests.get( url, headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15' }) bs = BeautifulSoup(response.text, 'html5lib') title = bs.findSelect('a.fw_post_author')[0].text try: album = bs.findSelect('.wall_post_text')[0].text.split('\n')[0].strip() except IndexError: album = '' album_id = bs.findSelect('.fw_like_count')[0]['id'].replace( 'like_count', '') try: cover = bs.findSelect('.page_media_thumb1 img')[0]['src'] except IndexError: cover = None print title, '-', album songs = [{ 'url': input['value'].split(',')[0], 'title': bs.findSelect('#audio%s .title_wrap' % input['id'].replace('audio_info', ''))[0].text } for input in bs.findSelect('input[type=hidden]') if input.has_key('id') and input['id'].startswith('audio')] # Creating folder target_dir = os.path.join('/tmp/vk-post-downloader/', album_id) if not os.path.exists(target_dir): os.makedirs(target_dir) # Downloading print '', title, '-', album_id if cover: download(cover, target_dir, 'cover.jpg', ' Cover') cover_filename = os.path.join(target_dir, 'cover.jpg') try: from PIL import Image image = Image.open(cover_filename) size = [min(image.size), min(image.size)] background = Image.new('RGBA', size, (255, 255, 255, 0)) background.paste(image, ((size[0] - image.size[0]) / 2, (size[1] - image.size[1]) / 2)) background.save(cover_filename, format='jpeg') except ImportError: print u'PIL не найден. 
Вы можете попробовать его установить командой easy_install PIL' print u'Ничего страшного, просто прямоугольные картинки для обложки не будут обрезаться до квадратных' print ' MP3s:' for i, song in enumerate(songs): download(song['url'], target_dir, '%d.mp3' % (i + 1), ' - ' + song['title']) # Parsing for f in os.listdir(target_dir): if not f.endswith('.mp3'): continue filename = os.path.join(target_dir, f) try: id3 = ID3(filename, translate=False) except mutagen.id3.ID3NoHeaderError: id3 = mutagen.id3.ID3() id3.unknown_frames = getattr(id3, 'unknown_frames', []) id3.update_to_v24() id3.add(TPE2(encoding=3, text=title)) if album: id3.add(TAL(encoding=3, text=album)) id3.add(TCMP(encoding=3, text='1')) id3.add(TRCK(encoding=3, text='')) if cover: id3.add( APIC( encoding=3, # 3 is for utf-8 mime='image/jpeg', # image/jpeg or image/png type=3, # 3 is for the cover image desc=u'Cover', data=open(cover_filename).read())) id3.save(filename) shutil.copyfile(filename, os.path.join(itunes_autoimport_dir, f)) os.system('rm -rf %s' % target_dir)
def get_page(page_num):
    """Scrape one page of Vimeo staff-picks subscribers and persist them.

    For each subscriber on the page we fetch their profile, count their
    videos via the public API, and check whether any of their videos
    carries the staff-pick badge.  Returns the list of saved User objects.
    """
    chrome_user_agent = "User-Agent:Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.33 (KHTML, like Gecko) Chrome/27.0.1430.0 Safari/537.33"

    def _fetch(target_url):
        # Single home for the request/UA boilerplate that was repeated
        # three times inline in the original.
        request = urllib2.Request(target_url)
        request.add_header('User-Agent', chrome_user_agent)
        return urllib2.build_opener().open(request).read()

    users = []
    page = _fetch('http://vimeo.com/channels/staffpicks/subscribers/page:{0}/sort:datesubscribe'.format(page_num))
    soup = BeautifulSoup(page)
    for a in soup.findSelect("ol.js-browse_list li a"):
        name = a['href']
        has_video_in_staff_pick = False
        url = 'http://vimeo.com{0}'.format(name)
        profile_soup = BeautifulSoup(_fetch(url))
        videos_api_url = "http://vimeo.com/api/v2/{0}/videos.json".format(name.replace('/', ''))
        videos = simplejson.loads(urllib2.urlopen(videos_api_url).read())
        for video in videos:
            video_soup = BeautifulSoup(_fetch(video.get('url')))
            # The staff-pick badge is embedded in an inline JSON blob.
            for script in video_soup.findSelect("script"):
                if script.prettify().find('{"badge":{"name":"staffpicks"') > 0:
                    has_video_in_staff_pick = True
                    break
            if has_video_in_staff_pick:
                break  # no need to fetch the remaining video pages
        pro = profile_soup.findSelect("div#profile span.badge_pro")
        plus = profile_soup.findSelect("div#profile span.badge_plus")
        video_count = 0
        try:
            video_count = int(profile_soup.findSelect("div#cols ul.pivots li")[0]['data-count'])
        except Exception:
            # Profile layout changed or count missing - treat as zero videos.
            pass
        u = User(
            name=unicode(a['title']).encode('utf-8', 'replace'),
            url=url,
            is_paying_user=True if pro or plus else False,
            has_video_in_staff_pick=has_video_in_staff_pick,
            has_atleast_one_video=video_count > 0
        )
        try:
            logging.info(u"Going to try and save: {0}".format(u))
        except Exception:
            # Narrowed from a bare except: never swallow KeyboardInterrupt.
            logging.exception("Unable to log user info: ")
        try:
            u.save()
            users.append(u)
        except Exception:
            logging.exception("Unable to save user: ")
    return users
def download_post(url): response = requests.get( url, headers={ "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15" }, ) bs = BeautifulSoup(response.text, "html5lib") title = bs.findSelect("a.fw_post_author")[0].text try: album = bs.findSelect(".wall_post_text")[0].text.split("\n")[0].strip() except IndexError: album = "" album_id = bs.findSelect(".fw_like_count")[0]["id"].replace("like_count", "") try: cover = bs.findSelect(".page_media_thumb1 img")[0]["src"] except IndexError: cover = None print title, "-", album songs = [ { "url": input["value"].split(",")[0], "title": bs.findSelect("#audio%s .title_wrap" % input["id"].replace("audio_info", ""))[0].text, } for input in bs.findSelect("input[type=hidden]") if input.has_key("id") and input["id"].startswith("audio") ] # Creating folder target_dir = os.path.join("/tmp/vk-post-downloader/", album_id) if not os.path.exists(target_dir): os.makedirs(target_dir) # Downloading print "", title, "-", album_id if cover: download(cover, target_dir, "cover.jpg", " Cover") cover_filename = os.path.join(target_dir, "cover.jpg") try: from PIL import Image image = Image.open(cover_filename) size = [min(image.size), min(image.size)] background = Image.new("RGBA", size, (255, 255, 255, 0)) background.paste(image, ((size[0] - image.size[0]) / 2, (size[1] - image.size[1]) / 2)) background.save(cover_filename, format="jpeg") except ImportError: print u"PIL не найден. 
Вы можете попробовать его установить командой easy_install PIL" print u"Ничего страшного, просто прямоугольные картинки для обложки не будут обрезаться до квадратных" print " MP3s:" for i, song in enumerate(songs): download(song["url"], target_dir, "%d.mp3" % (i + 1), " - " + song["title"]) # Parsing for f in os.listdir(target_dir): if not f.endswith(".mp3"): continue filename = os.path.join(target_dir, f) try: id3 = ID3(filename, translate=False) except mutagen.id3.ID3NoHeaderError: id3 = mutagen.id3.ID3() id3.unknown_frames = getattr(id3, "unknown_frames", []) id3.update_to_v24() id3.add(TPE2(encoding=3, text=title)) if album: id3.add(TAL(encoding=3, text=album)) id3.add(TCMP(encoding=3, text="1")) id3.add(TRCK(encoding=3, text="")) if cover: id3.add( APIC( encoding=3, # 3 is for utf-8 mime="image/jpeg", # image/jpeg or image/png type=3, # 3 is for the cover image desc=u"Cover", data=open(cover_filename).read(), ) ) id3.save(filename) shutil.copyfile(filename, os.path.join(itunes_autoimport_dir, f)) os.system("rm -rf %s" % target_dir)
import os
import re
import json

import requests
import soupselect
from bs4 import BeautifulSoup

soupselect.monkeypatch()

# Scrape every recipe URL listed in urls.txt next to this script and
# dump the collected metadata as a single JSON array on stdout.
scraper_dir = os.path.dirname(os.path.realpath(__file__))
id_re = re.compile(r'/recipes/(?P<id>[\d]+)/.+')

recipes = []
# Use a context manager so the urls file is closed deterministically
# (the original open() was never closed).
with open(scraper_dir + '/urls.txt', 'r') as urls_file:
    for url in urls_file:
        url = url.strip()
        r = requests.get(url)
        soup = BeautifulSoup(r.text)
        recipes.append({
            'id': id_re.search(url).group('id'),
            'title': soup.findSelect('h1.article')[0].text.strip(),
            'teaser': soup.findSelect('.recipe-summary .summary')[0].text.strip(),
            'imageUrl': soup.findSelect('.articleImage .photo')[0]['src'],
            'url': url
        })

print(json.dumps(recipes))
def download_post(url): response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15'}) bs = BeautifulSoup(response.text, 'html5lib') title = bs.findSelect('a.fw_post_author')[0].text album = bs.findSelect('.wall_post_text')[0].text.split('\n')[0].strip() album_id = bs.findSelect('.fw_like_count')[0]['id'].split('-')[1] cover = bs.findSelect('.page_media_thumb1 img')[0]['src'] print title, '-', album songs = [{ 'url': input['value'].split(',')[0], 'title': bs.findSelect('#audio%s .title_wrap' % input['id'].replace('audio_info', ''))[0].text } for input in bs.findSelect('input[type=hidden]') if input.has_key('id') and input['id'].startswith('audio') ] # Creating folder target_dir = os.path.join('/tmp/vk-post-downloader/',album_id) if not os.path.exists(target_dir): os.makedirs(target_dir) # Downloading print '', title, '-', album_id download(cover, target_dir, 'cover.jpg', ' Cover') cover_filename = os.path.join(target_dir, 'cover.jpg') try: from PIL import Image image = Image.open(cover_filename) size = [min(image.size), min(image.size)] background = Image.new('RGBA', size, (255, 255, 255, 0)) background.paste( image, ((size[0] - image.size[0]) / 2, (size[1] - image.size[1]) / 2)) background.save(cover_filename, format='jpeg') except ImportError: print u'PIL не найден. 
Вы можете попробовать его установить командой easy_install PIL' print u'Ничего страшного, просто прямоугольные картинки для обложки не будут обрезаться до квадратных' print ' MP3s:' for i, song in enumerate(songs): download(song['url'], target_dir, '%d.mp3' % (i+1), ' - ' + song['title']) # Parsing for f in os.listdir(target_dir): if not f.endswith('.mp3'): continue filename = os.path.join(target_dir, f) try: id3 = ID3(filename, translate=False) except mutagen.id3.ID3NoHeaderError: id3 = mutagen.id3.ID3() id3.unknown_frames = getattr(id3, 'unknown_frames', []) id3.update_to_v24() id3.add(TPE2(encoding=3, text=title)) id3.add(TAL(encoding=3, text=album)) id3.add(TCMP(encoding=3, text='1')) id3.add(TRCK(encoding=3, text='')) id3.add( APIC( encoding=3, # 3 is for utf-8 mime='image/jpeg', # image/jpeg or image/png type=3, # 3 is for the cover image desc=u'Cover', data=open(cover_filename).read() ) ) id3.save(filename) shutil.copyfile(filename, os.path.join(itunes_autoimport_dir, f)) os.system('rm -rf %s' % target_dir)