Ejemplo n.º 1
0
def get_video_providers(url):
    """Scrape the page at *url* and return a list of provider dicts.

    Each dict has two keys: 'provider_name' and 'provider_url'. When a
    video item carries no link, the page URL itself is used as the
    provider URL.
    """
    soup = BeautifulSoup(fetchUrl(url))
    providers = []
    for entry in soup.findSelect('div.generic-video-item'):
        thumb = BeautifulSoup(str(entry.contents)).findSelect('div.thumb')[0]
        if thumb.a:
            # Linked provider: name from the centered span, URL from the anchor.
            name = thumb.a.center.span.string
            link = thumb.a['href']
        else:
            # Unlinked provider: fall back to the page URL and take the
            # name from the second <span> inside the thumb markup.
            spans = BeautifulSoup(str(thumb.contents)).findAll('span')
            name = spans[1].string
            link = url
        providers.append({'provider_name': name, 'provider_url': link})
    return providers
Ejemplo n.º 2
0
    def test_monkeypatch_implicit(self):
        # findSelect() must exist on soups only while the soupselect
        # monkeypatch is installed.
        soup = BeautifulSoup(HTML)
        # Not patched yet: calling the missing method must raise.
        self.assertRaises(TypeError, soup.findSelect, '*')

        monkeypatch()

        # Patched: the universal selector matches at least one element,
        # and specific selectors resolve to the expected element ids.
        self.assert_(soup.findSelect('*'))
        self.assertSelectMultipleExplicit(soup,
            ('link', ['l1']),
            ('div#main', ['main']),
            ('div div', ['inner']),
        )

        unmonkeypatch()

        # Unpatched again: the method is gone and raises once more.
        self.assertRaises(TypeError, soup.findSelect, '*')
Ejemplo n.º 3
0
    def _analyze_response(self, response):
        """Check *response* against every configured condition.

        Each condition in self._conditions is a mapping with 'selector',
        'pattern' and 'found' keys. Returns True only when every
        condition's expectation about whether its selector (or text
        pattern) matches is satisfied.
        """
        soup = BeautifulSoup(response)

        for condition in self._conditions:
            self.logger.debug('Condition: {0}'.format(condition))

            selector = condition.get('selector')
            pattern = condition.get('pattern')
            expected = condition.get('found')

            # The special "_text" selector searches the raw page text
            # instead of applying a CSS selector.
            if selector == "_text":
                match = soup.find(text=pattern)
            else:
                match = soup.findSelect(selector)

            # Stop at the first condition whose outcome disagrees.
            if bool(match) != expected:
                return False

        return True
Ejemplo n.º 4
0
    def _analyze_response(self, response):
        """Evaluate all of self._conditions against *response*.

        A condition is a mapping with 'selector', 'pattern' and 'found';
        returns True iff every condition's presence expectation holds.
        """
        soup = BeautifulSoup(response)

        def _matches(cond):
            # "_text" is a pseudo-selector meaning "search the page text".
            sel = cond.get('selector')
            if sel == "_text":
                return soup.find(text=cond.get('pattern'))
            return soup.findSelect(sel)

        for cond in self._conditions:
            self.logger.debug('Condition: {0}'.format(cond))
            if cond.get('found') != bool(_matches(cond)):
                return False
        return True
Ejemplo n.º 5
0
def download_post(url):
    response = requests.get(
        url,
        headers={
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15'
        })

    bs = BeautifulSoup(response.text, 'html5lib')
    title = bs.findSelect('a.fw_post_author')[0].text
    try:
        album = bs.findSelect('.wall_post_text')[0].text.split('\n')[0].strip()
    except IndexError:
        album = ''
    album_id = bs.findSelect('.fw_like_count')[0]['id'].replace(
        'like_count', '')
    try:
        cover = bs.findSelect('.page_media_thumb1 img')[0]['src']
    except IndexError:
        cover = None
    print title, '-', album
    songs = [{
        'url':
        input['value'].split(',')[0],
        'title':
        bs.findSelect('#audio%s .title_wrap' %
                      input['id'].replace('audio_info', ''))[0].text
    } for input in bs.findSelect('input[type=hidden]')
             if input.has_key('id') and input['id'].startswith('audio')]

    # Creating folder
    target_dir = os.path.join('/tmp/vk-post-downloader/', album_id)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # Downloading
    print '', title, '-', album_id
    if cover:
        download(cover, target_dir, 'cover.jpg', ' Cover')
        cover_filename = os.path.join(target_dir, 'cover.jpg')

        try:
            from PIL import Image
            image = Image.open(cover_filename)
            size = [min(image.size), min(image.size)]
            background = Image.new('RGBA', size, (255, 255, 255, 0))
            background.paste(image, ((size[0] - image.size[0]) / 2,
                                     (size[1] - image.size[1]) / 2))
            background.save(cover_filename, format='jpeg')
        except ImportError:
            print u'PIL не найден. Вы можете попробовать его установить командой easy_install PIL'
            print u'Ничего страшного, просто прямоугольные картинки для обложки не будут обрезаться до квадратных'

    print ' MP3s:'
    for i, song in enumerate(songs):
        download(song['url'], target_dir, '%d.mp3' % (i + 1),
                 '  - ' + song['title'])

    # Parsing
    for f in os.listdir(target_dir):
        if not f.endswith('.mp3'):
            continue
        filename = os.path.join(target_dir, f)
        try:
            id3 = ID3(filename, translate=False)
        except mutagen.id3.ID3NoHeaderError:
            id3 = mutagen.id3.ID3()
        id3.unknown_frames = getattr(id3, 'unknown_frames', [])
        id3.update_to_v24()
        id3.add(TPE2(encoding=3, text=title))
        if album:
            id3.add(TAL(encoding=3, text=album))
        id3.add(TCMP(encoding=3, text='1'))
        id3.add(TRCK(encoding=3, text=''))
        if cover:
            id3.add(
                APIC(
                    encoding=3,  # 3 is for utf-8
                    mime='image/jpeg',  # image/jpeg or image/png
                    type=3,  # 3 is for the cover image
                    desc=u'Cover',
                    data=open(cover_filename).read()))
        id3.save(filename)
        shutil.copyfile(filename, os.path.join(itunes_autoimport_dir, f))

    os.system('rm -rf %s' % target_dir)
Ejemplo n.º 6
0
def get_page(page_num):
    """Scrape one page of Vimeo staff-picks subscribers and persist them.

    Fetches subscriber page *page_num*, then for each subscriber inspects
    their profile and videos to determine whether they are a paying user
    (pro/plus badge), have any videos, and whether any of their videos is
    a staff pick. Each subscriber is saved as a User; the list of saved
    users is returned.
    """
    chrome_user_agent = "User-Agent:Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.33 (KHTML, like Gecko) Chrome/27.0.1430.0 Safari/537.33"

    def _fetch(target_url):
        # All page fetches spoof a Chrome user agent (same three-line
        # request dance was previously copy-pasted three times).
        request = urllib2.Request(target_url)
        request.add_header('User-Agent', chrome_user_agent)
        return urllib2.build_opener().open(request).read()

    users = []
    page = _fetch('http://vimeo.com/channels/staffpicks/subscribers/page:{0}/sort:datesubscribe'.format(page_num))
    soup = BeautifulSoup(page)

    for a in soup.findSelect("ol.js-browse_list li a"):
        name = a['href']
        has_video_in_staff_pick = False
        url = 'http://vimeo.com{0}'.format(name)
        profile_soup = BeautifulSoup(_fetch(url))

        videos_api_url = "http://vimeo.com/api/v2/{0}/videos.json".format(name.replace('/', ''))
        videos = simplejson.loads(urllib2.urlopen(videos_api_url).read())
        for video in videos:
            video_soup = BeautifulSoup(_fetch(video.get('url')))
            for script in video_soup.findSelect("script"):
                # BUG FIX: str.find() returns 0 for a match at the very
                # start of the script text; the old "> 0" test missed it.
                if script.prettify().find('{"badge":{"name":"staffpicks"') != -1:
                    has_video_in_staff_pick = True
                    break
            if has_video_in_staff_pick:
                # No need to fetch the remaining videos once one is found.
                break

        pro = profile_soup.findSelect("div#profile span.badge_pro")
        plus = profile_soup.findSelect("div#profile span.badge_plus")
        video_count = 0  # was misspelled "vidoes"
        try:
            video_count = int(profile_soup.findSelect("div#cols ul.pivots li")[0]['data-count'])
        except (IndexError, KeyError, ValueError, TypeError):
            # Profile layout changed or the attribute is missing; assume 0.
            pass

        u = User(
            name=unicode(a['title']).encode('utf-8', 'replace'),
            url=url,
            is_paying_user=bool(pro or plus),
            has_video_in_staff_pick=has_video_in_staff_pick,
            has_atleast_one_video=video_count > 0
        )

        # Best-effort logging/saving: failures are recorded but must not
        # abort the scrape of the remaining subscribers.
        try:
            logging.info(u"Going to try and save: {0}".format(u))
        except Exception:
            logging.exception("Unable to log user info: ")

        try:
            u.save()
            users.append(u)
        except Exception:
            logging.exception("Unable to save user: ")

    return users
Ejemplo n.º 7
0
def download_post(url):
    response = requests.get(
        url,
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15"
        },
    )

    bs = BeautifulSoup(response.text, "html5lib")
    title = bs.findSelect("a.fw_post_author")[0].text
    try:
        album = bs.findSelect(".wall_post_text")[0].text.split("\n")[0].strip()
    except IndexError:
        album = ""
    album_id = bs.findSelect(".fw_like_count")[0]["id"].replace("like_count", "")
    try:
        cover = bs.findSelect(".page_media_thumb1 img")[0]["src"]
    except IndexError:
        cover = None
    print title, "-", album
    songs = [
        {
            "url": input["value"].split(",")[0],
            "title": bs.findSelect("#audio%s .title_wrap" % input["id"].replace("audio_info", ""))[0].text,
        }
        for input in bs.findSelect("input[type=hidden]")
        if input.has_key("id") and input["id"].startswith("audio")
    ]

    # Creating folder
    target_dir = os.path.join("/tmp/vk-post-downloader/", album_id)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # Downloading
    print "", title, "-", album_id
    if cover:
        download(cover, target_dir, "cover.jpg", " Cover")
        cover_filename = os.path.join(target_dir, "cover.jpg")

        try:
            from PIL import Image

            image = Image.open(cover_filename)
            size = [min(image.size), min(image.size)]
            background = Image.new("RGBA", size, (255, 255, 255, 0))
            background.paste(image, ((size[0] - image.size[0]) / 2, (size[1] - image.size[1]) / 2))
            background.save(cover_filename, format="jpeg")
        except ImportError:
            print u"PIL не найден. Вы можете попробовать его установить командой easy_install PIL"
            print u"Ничего страшного, просто прямоугольные картинки для обложки не будут обрезаться до квадратных"

    print " MP3s:"
    for i, song in enumerate(songs):
        download(song["url"], target_dir, "%d.mp3" % (i + 1), "  - " + song["title"])

    # Parsing
    for f in os.listdir(target_dir):
        if not f.endswith(".mp3"):
            continue
        filename = os.path.join(target_dir, f)
        try:
            id3 = ID3(filename, translate=False)
        except mutagen.id3.ID3NoHeaderError:
            id3 = mutagen.id3.ID3()
        id3.unknown_frames = getattr(id3, "unknown_frames", [])
        id3.update_to_v24()
        id3.add(TPE2(encoding=3, text=title))
        if album:
            id3.add(TAL(encoding=3, text=album))
        id3.add(TCMP(encoding=3, text="1"))
        id3.add(TRCK(encoding=3, text=""))
        if cover:
            id3.add(
                APIC(
                    encoding=3,  # 3 is for utf-8
                    mime="image/jpeg",  # image/jpeg or image/png
                    type=3,  # 3 is for the cover image
                    desc=u"Cover",
                    data=open(cover_filename).read(),
                )
            )
        id3.save(filename)
        shutil.copyfile(filename, os.path.join(itunes_autoimport_dir, f))

    os.system("rm -rf %s" % target_dir)
import os
import re
import json
import requests
import soupselect
from bs4 import BeautifulSoup

soupselect.monkeypatch()

scraper_dir = os.path.dirname(os.path.realpath(__file__))

recipes = []
# Raw string so "\d" is a regex digit class, not a (deprecated) string escape.
id_re = re.compile(r'/recipes/(?P<id>\d+)/.+')

# Context manager guarantees the file handle is closed (it was leaked before).
with open(os.path.join(scraper_dir, 'urls.txt'), 'r') as urls_file:
    for url in urls_file:
        url = url.strip()
        if not url:
            # Skip blank lines (e.g. a trailing newline in urls.txt).
            continue

        match = id_re.search(url)
        if match is None:
            # BUG FIX: a non-recipe URL previously crashed with
            # AttributeError on .group(); skip it instead.
            continue

        r = requests.get(url)
        soup = BeautifulSoup(r.text)

        recipes.append({
            'id': match.group('id'),
            'title': soup.findSelect('h1.article')[0].text.strip(),
            'teaser': soup.findSelect('.recipe-summary .summary')[0].text.strip(),
            'imageUrl': soup.findSelect('.articleImage .photo')[0]['src'],
            'url': url
        })

print(json.dumps(recipes))
Ejemplo n.º 9
0
def download_post(url):
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15'})

    bs = BeautifulSoup(response.text, 'html5lib')
    title = bs.findSelect('a.fw_post_author')[0].text
    album = bs.findSelect('.wall_post_text')[0].text.split('\n')[0].strip()
    album_id = bs.findSelect('.fw_like_count')[0]['id'].split('-')[1]
    cover =  bs.findSelect('.page_media_thumb1 img')[0]['src']
    print title, '-', album
    songs = [{
        'url': input['value'].split(',')[0],
        'title': bs.findSelect('#audio%s .title_wrap' % input['id'].replace('audio_info', ''))[0].text
        } for input in bs.findSelect('input[type=hidden]') if input.has_key('id') and input['id'].startswith('audio')
    ]

    # Creating folder
    target_dir = os.path.join('/tmp/vk-post-downloader/',album_id)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # Downloading
    print '', title, '-', album_id
    download(cover, target_dir, 'cover.jpg', ' Cover')
    cover_filename = os.path.join(target_dir, 'cover.jpg')

    try:
        from PIL import Image
        image = Image.open(cover_filename)
        size = [min(image.size), min(image.size)]
        background = Image.new('RGBA', size, (255, 255, 255, 0))
        background.paste(
            image,
            ((size[0] - image.size[0]) / 2, (size[1] - image.size[1]) / 2))
        background.save(cover_filename, format='jpeg')
    except ImportError:
        print u'PIL не найден. Вы можете попробовать его установить командой easy_install PIL'
        print u'Ничего страшного, просто прямоугольные картинки для обложки не будут обрезаться до квадратных'

    print ' MP3s:'
    for i, song in enumerate(songs):
        download(song['url'], target_dir, '%d.mp3' % (i+1), '  - ' + song['title'])

    # Parsing
    for f in os.listdir(target_dir):
        if not f.endswith('.mp3'):
            continue
        filename = os.path.join(target_dir, f)
        try:
            id3 = ID3(filename, translate=False)
        except mutagen.id3.ID3NoHeaderError:
            id3 = mutagen.id3.ID3()
        id3.unknown_frames = getattr(id3, 'unknown_frames', [])
        id3.update_to_v24()
        id3.add(TPE2(encoding=3, text=title))
        id3.add(TAL(encoding=3, text=album))
        id3.add(TCMP(encoding=3, text='1'))
        id3.add(TRCK(encoding=3, text=''))
        id3.add(
            APIC(
                encoding=3, # 3 is for utf-8
                mime='image/jpeg', # image/jpeg or image/png
                type=3, # 3 is for the cover image
                desc=u'Cover',
                data=open(cover_filename).read()
            )
        )
        id3.save(filename)
        shutil.copyfile(filename, os.path.join(itunes_autoimport_dir, f))

    os.system('rm -rf %s' % target_dir)