Ejemplo n.º 1
0
def get_data(piratebayId):
    _key_map = {
      'spoken language(s)': u'language',
      'texted language(s)': u'subtitle language',
      'by': u'uploader',
      'leechers': 'leecher',
      'seeders': 'seeder',
    }
    piratebayId = get_id(piratebayId)
    torrent = dict()
    torrent[u'id'] = piratebayId
    torrent[u'domain'] = 'thepiratebay.org'
    torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId

    data = read_url(torrent['comment_link'], unicode=True)
    torrent[u'title'] = find_re(data, '<title>(.*?) \(download torrent\) - TPB</title>')
    if not torrent[u'title']:
        return None
    torrent[u'title'] = decode_html(torrent[u'title']).strip()
    torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
    title = quote(torrent['title'].encode('utf-8'))
    torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
    for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
        key = d[0].lower().strip()
        key = _key_map.get(key, key)
        value = decode_html(strip_tags(d[1].strip()))
        torrent[key] = value
    torrent[u'description'] = find_re(data, '<div class="nfo">(.*?)</div>')
    if torrent[u'description']:
        torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
    t = read_url(torrent[u'torrent_link'])
    torrent[u'torrent_info'] = get_torrent_info(t)
    return torrent
Ejemplo n.º 2
0
def get_data(mininovaId):
    _key_map = {
        'by': u'uploader',
    }
    mininovaId = get_id(mininovaId)
    torrent = dict()
    torrent[u'id'] = mininovaId
    torrent[u'domain'] = 'mininova.org'
    torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
    torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
    torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId

    data = read_url(torrent['comment_link'], unicode=True) + read_url(torrent['details_link'], unicode=True)
    if '<h1>Torrent not found...</h1>' in data:
        return None

    for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
        key = d[0].lower().strip()
        key = _key_map.get(key, key)
        value = decode_html(strip_tags(d[1].strip()))
        torrent[key] = value

    torrent[u'title'] = find_re(data, '<title>(.*?):.*?</title>')
    torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
    torrent[u'description'] = find_re(data, '<div id="description">(.*?)</div>')
    if torrent['description']:
        torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
    t = read_url(torrent[u'torrent_link'])
    torrent[u'torrent_info'] = get_torrent_info(t)
    return torrent
Ejemplo n.º 3
0
def get_data(isbn):
    r = {}
    url = '%s/Search/Book/%s/1' % (base, isbn)

    data = read_url(url).decode('utf-8')
    m = re.compile('href="(/Lookup/Book/[^"]+?)"').findall(data)
    if m:
        ids = m[0].split('/')
        r['isbn'] = ids[-2]
        r['asin'] = ids[-3]
        url = '%s%s' % (base, m[0])
        data = read_url(url).decode('utf-8')
        r["title"] = find_re(data, "<h2>(.*?)</h2>")
        keys = {
            'author': 'Author(s)',
            'publisher': 'Publisher',
            'date': 'Publication date',
            'edition': 'Edition',
            'binding': 'Binding',
            'volume': 'Volume(s)',
            'pages': 'Pages',
        }
        for key in keys:
            r[key] = find_re(data, '<span class="title">%s:</span>(.*?)</li>'% re.escape(keys[key]))
            if r[key] == '--':
                r[key] = ''
            if key == 'pages' and r[key]:
                r[key] = int(r[key])
        desc = find_re(data, '<h2>Description:<\/h2>(.*?)<div ')
        desc = desc.replace('<br /><br />', ' ').replace('<br /> ', ' ').replace('<br />', ' ')
        r['description'] = strip_tags(desc).strip()
        if r['description'] == u'Description of this item is not available at this time.':
            r['description'] = ''
        r['cover'] = find_re(data, '<img src="(.*?)" alt="Book cover').replace('._SL160_', '')
    return r
Ejemplo n.º 4
0
def get_show_data(url):
    data = read_url(url, unicode=True)
    r = {}
    r['title'] = strip_tags(find_re(data, '<h1>(.*?)</h1>'))
    r['imdb'] = find_re(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
    r['episodes'] = {}
    #1.   1- 1       1001      7 Aug 05   You Can't Miss the Bear
    for episode in re.compile('(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) <a target="_blank" href="(.*?)">(.*?)</a>').findall(data):
        air_date = episode[3].strip()
        #'22 Sep 04' -> 2004-09-22 
        try:
            air_date = time.strftime('%Y-%m-%d', time.strptime(air_date, '%d %b %y'))
        except:
            pass
        s = episode[1].split('-')[0].strip()
        e = episode[1].split('-')[-1].strip()
        try:
            r['episodes']['S%02dE%02d' % (int(s), int(e))] = {
                'prod code': episode[2],
                'air date': air_date,
                'url': episode[4],
                'title':episode[5],
            }
        except:
            print "oxweb.epguides failed,", url
    return r
Ejemplo n.º 5
0
def get_id(piratebayId):
    if piratebayId.startswith('http://torrents.thepiratebay.org/'):
        piratebayId = piratebayId.split('org/')[1]
    d = find_re(piratebayId, "tor/(\d+)")
    if d:
        piratebayId = d
    d = find_re(piratebayId, "torrent/(\d+)")
    if d:
        piratebayId = d
    return piratebayId
Ejemplo n.º 6
0
def get_data(id):
    url = "http://www.amazon.com/title/dp/%s/" % id
    data = read_url(url, unicode=True)


    def find_data(key):
        return find_re(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()

    r = {}
    r['amazon'] = url
    r['title'] = find_re(data, '<span id="productTitle" class="a-size-large">(.*?)</span>')
    r['authors'] = []
    doc = lxml.html.document_fromstring(data)
    for e in doc.xpath("//span[contains(@class, 'author')]"):
        print e
        for secondary in e.xpath(".//span[contains(@class, 'a-color-secondary')]"):
            if 'Author' in secondary.text:
                author = e.xpath(".//span[contains(@class, 'a-size-medium')]")
                if author:
                    r['authors'].append(author[0].text.strip())
                else:
                    r['authors'].append(e.xpath('.//a')[0].text.strip())
                break
            elif 'Translator' in secondary.text:
                r['translator'] = [e.xpath('.//a')[0].text]
                break
    r['publisher'] = find_data('Publisher')
    r['language'] = find_data('Language')
    r['isbn-10'] = find_data('ISBN-10')
    r['isbn-13'] = find_data('ISBN-13').replace('-', '')
    r['dimensions'] = find_re(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')

    r['pages'] = find_data('Paperback')
    if not r['pages']:
        r['pages'] = find_data('Hardcover')

    r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()

    for e in doc.xpath('//noscript'):
        for c in e.getchildren():
            if c.tag == 'div':
                r['description'] = strip_tags(decode_html(lxml.html.tostring(c))).strip()
                break

    r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
    if r['cover']:
        r['cover'] = r['cover'][0].split('._BO2')[0]
        if not r['cover'].endswith('.jpg'):
            r['cover'] = r['cover'] + '.jpg'
        if 'no-image-avail-img' in r['cover']:
            del r['cover']
    else:
        del r['cover']
    return r
Ejemplo n.º 7
0
    def __init__(self, id, timeout=-1):
        url = "http://ids.freebaseapps.com/get_ids?id=/authority/imdb/title/tt%s" % id
        '''
            "http://graph.freebase.com/imdb.title.tt%s" % id
            might also be of interest at some point, right now not much info
        '''
        data = read_url(url, unicode=True)
        try:
            data = json.loads(data)
        except ValueError:
            return
        '''
        for key in data:
            self[key] = data[key]
        '''
        for key in ('id', 'guid', 'name'):
            self[key] = data[key]
        keys = {
            'wikipedia': '/wikipedia/en',
            'netflix': '/authority/netflix/movie',
            'nytimes': '/source/nytimes/movie',
            'metacritic': '/source/metacritic/movie',
        }
        for key in keys:
            links = filter(lambda x: x['namespace'] == keys[key],data['ids'])
            if links:
                self[key] = links[0]['uri']

        if 'nytimes' in self:
            self['nytimes'] = self['nytimes'].replace('_/overview', '%s/overview' % self['name'].replace(' ', '-'))
            self['amgId'] = find_re(self['nytimes'], 'movie/(\d+)/')
Ejemplo n.º 8
0
def get_book(id):
    if isinstance(id, basestring) and id.startswith('http'):
        url = id
    else:
        url = get_url(id)
    html = ox.cache.read_url(url, unicode=True)
    data = {}
    data['url'] = url
    pages = []
    page = get_page(url)
    pages.append(page)
    data['base'], data['images'] = get_images(page, html, True)
    info = ox.find_re(html, '<table>.*?</table>')
    for i in re.compile('<tr.*?>(.*?)</tr>').findall(info):
        key, value = i.split('</td><td>')
        data[ox.strip_tags(key)] = ox.strip_tags(value)
    links = re.compile('<a style="float: right;" href="(/buch/.*?)">').findall(html)
    while links:
        for l in links:
            l = 'http://gutenberg.spiegel.de' + l
            html = ox.cache.read_url(l)
            links = re.compile('<a style="float: right;" href="(/buch/.*?)">').findall(html)
            page = get_page(l)
            pages.append(page)
            data['images'] += get_images(page, html)
    data['pages'] = pages
    return data
Ejemplo n.º 9
0
def get_url(id=None, imdb=None):
    if imdb:
        url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb
        data = read_url(url)
        metacritic_url = find_re(data, '"(http://www.metacritic.com/movie/.*?)"')
        return metacritic_url or None
    return 'http://www.metacritic.com/movie/%s' % id
Ejemplo n.º 10
0
def parse_table(html):
    return [
        [
            strip_tags(r).strip().replace('&nbsp;', '')
            for r in x.split('<td width="305">-')
        ]
        for x in find_re(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
    ]
Ejemplo n.º 11
0
def lookup(id):
    logger.debug('lookup %s', id)
    r = {'asin': [id]}
    url = '%s/Lookup/Book/%s/%s/1' % (base, id, id)
    logger.debug('%s', url)
    data = read_url(url).decode('utf-8')
    r["title"] = find_re(data, "<h2>(.*?)</h2>")
    if r["title"] == 'Error!':
        return {}
    keys = {
        'author': 'Author(s)',
        'publisher': 'Publisher',
        'date': 'Publication date',
        'edition': 'Edition',
        'binding': 'Binding',
        'volume': 'Volume(s)',
        'pages': 'Pages',
    }
    for key in keys:
        r[key] = find_re(
            data,
            '<span class="title">%s:</span>(.*?)</li>' % re.escape(keys[key]))
        if r[key] == '--' or not r[key]:
            del r[key]
        if key == 'pages' and key in r:
            r[key] = int(r[key])
    desc = find_re(data, '<h2>Description:<\/h2>(.*?)<div ')
    desc = desc.replace('<br /><br />',
                        ' ').replace('<br /> ', ' ').replace('<br />', ' ')
    r['description'] = decode_html(strip_tags(desc))
    r['cover'] = find_re(data, '<img src="(.*?)" alt="Book cover').replace(
        '._SL160_', '')
    for key in r:
        if isinstance(r[key], str):
            r[key] = decode_html(strip_tags(r[key])).strip()
    if 'author' in r and isinstance(r['author'], str) and r['author']:
        r['author'] = [r['author']]
    else:
        r['author'] = []
    if not r['author'] or r['author'][0].isupper():
        del r['author']
    if r['description'].lower(
    ) == 'Description of this item is not available at this time.'.lower():
        r['description'] = ''
    return r
Ejemplo n.º 12
0
def findISBN(title, author):
    q = '%s %s' % (title, author)
    url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=" + "%s&x=0&y=0" % quote(q)
    data = read_url(url, unicode=True)
    links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
    id = find_re(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
    data = get_data(id)
    if author in data['authors']:
        return data
    return {}
Ejemplo n.º 13
0
def get_id(mininovaId):
    mininovaId = unicode(mininovaId)
    d = find_re(mininovaId, "/(\d+)")
    if d:
        return d
    mininovaId = mininovaId.split('/')
    if len(mininovaId) == 1:
        return mininovaId[0]
    else:
        return mininovaId[-1]
Ejemplo n.º 14
0
def lookup(id):
    logger.debug('lookup %s', id)
    r = {
        'asin': [id]
    }
    url = '%s/Lookup/Book/%s/%s/1' % (base, id, id)
    logger.debug('%s', url)
    data = read_url(url).decode('utf-8')
    r["title"] = find_re(data, "<h2>(.*?)</h2>")
    if r["title"] == 'Error!':
        return {}
    keys = {
        'author': 'Author(s)',
        'publisher': 'Publisher',
        'date': 'Publication date',
        'edition': 'Edition',
        'binding': 'Binding',
        'volume': 'Volume(s)',
        'pages': 'Pages',
    }
    for key in keys:
        r[key] = find_re(data, '<span class="title">%s:</span>(.*?)</li>'% re.escape(keys[key]))
        if r[key] == '--' or not r[key]:
            del r[key]
        if key == 'pages' and key in r:
            r[key] = int(r[key])
    desc = find_re(data, '<h2>Description:<\/h2>(.*?)<div ')
    desc = desc.replace('<br /><br />', ' ').replace('<br /> ', ' ').replace('<br />', ' ')
    r['description'] = decode_html(strip_tags(desc))
    r['cover'] = find_re(data, '<img src="(.*?)" alt="Book cover').replace('._SL160_', '')
    for key in r:
        if isinstance(r[key], str):
            r[key] = decode_html(strip_tags(r[key])).strip()
    if 'author' in r and isinstance(r['author'], str) and r['author']:
        r['author'] = [r['author']]
    else:
        r['author'] = []
    if not r['author'] or r['author'][0].isupper():
        del r['author']
    if r['description'].lower() == 'Description of this item is not available at this time.'.lower():
        r['description'] = ''
    return r
Ejemplo n.º 15
0
def get_images(page, html, b=False):
    img = []
    base = ''
    if '<img' in page:
        base = ox.find_re(html, '<base href="(.*?)"')
        for url in re.compile('<img.*?src="(.*?)"').findall(page):
            url = base + url
            img.append(url)
    if b:
        return base, img
    return img
Ejemplo n.º 16
0
def get_data(url):
    data = read_url(url)
    r = {}
    r['title'] = find_re(data, '<h1 class="movie_title">(.*?)</h1>')
    if '(' in r['title']:
        r['year'] = find_re(r['title'], '\((\d*?)\)')
        r['title'] = strip_tags(re.sub('\((\d*?)\)', '', r['title'])).strip()
    r['summary'] = strip_tags(find_re(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
    r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace('  ', ' ').replace('  ', ' ')
    if not r['summary']:
        r['summary'] = get_og(data, 'description')

    meter = re.compile('<span id="all-critics-meter" class="meter(.*?)">(.*?)</span>').findall(data)
    meter = filter(lambda m: m[1].isdigit(), meter)
    if meter:
        r['tomatometer'] = meter[0][1]
    r['rating'] = find_re(data, 'Average Rating: <span>([\d.]+)/10</span>')
    r['user_score'] = find_re(data, '<span class="meter popcorn numeric ">(\d+)</span>')
    r['user_rating'] = find_re(data, 'Average Rating: ([\d.]+)/5')
    poster = get_og(data, 'image')
    if poster and not 'poster_default.gif' in poster:
        r['posters'] = [poster]
    for key in r.keys():
        if not r[key]:
            del r[key]
    return r
Ejemplo n.º 17
0
def get_episode_data(url):
    '''
      prases informatin on tvcom episode pages
      returns dict with title, show, description, score
      example:
        get_episode_data('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
    '''
    data = read_url(url, unicode=True)
    r = {}
    r['description'] = strip_tags(find_re(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
    r['show'] = find_re(data, '<h1>(.*?)</h1>')
    r['title'] =  find_re(data, '<title>.*?: (.*?) - TV.com  </title>')
    #episode score
    r['episode score'] = find_re(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')

    match = re.compile('Episode Number: (\d*?) &nbsp;&nbsp; Season Num: (\d*?) &nbsp;&nbsp; First Aired: (.*?) &nbsp').findall(data) 
    if match:
        r['season'] = int(match[0][1])
        r['episode'] = int(match[0][0])
        #'Wednesday September 29, 2004' -> 2004-09-29 
        r['air date'] = time.strftime('%Y-%m-%d', time.strptime(match[0][2], '%A %B %d, %Y'))
    return r
Ejemplo n.º 18
0
def get_posters(url, group=True, timeout=-1):
    posters = []
    html = read_url(url, timeout=timeout, unicode=True)
    if url in html:
        if group:
            results = re.compile('<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL).findall(html)
            for result in results:
                posters += get_posters(result, False)
        results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
        for result in results:
            html = read_url(result, timeout=timeout, unicode=True)
            posters.append(find_re(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
    return posters
Ejemplo n.º 19
0
def get_movie_poster(imdbId):
    '''
    >>> get_movie_poster('0133093')
    'http://ia.media-imdb.com/images/M/MV5BMjEzNjg1NTg2NV5BMl5BanBnXkFtZTYwNjY3MzQ5._V1._SX338_SY475_.jpg'

    >>> get_movie_poster('0994352')
    'http://ia.media-imdb.com/images/M/MV5BMjA3NzMyMzU1MV5BMl5BanBnXkFtZTcwNjc1ODUwMg@@._V1._SX594_SY755_.jpg'
    '''
    info = ImdbCombined(imdbId)
    if 'posterId' in info:
        url = "http://www.imdb.com/rg/action-box-title/primary-photo/media/rm%s/tt%s" % (info['posterId'], imdbId)
        data = read_url(url)
        poster = find_re(data, 'img id="primary-img".*?src="(.*?)"')
        return poster
    elif 'series' in info:
        return get_movie_poster(info['series'])
    return ''
Ejemplo n.º 20
0
def find_subtitles(imdb, parts = 1, language = "eng"):
    if len(language) == 2:
        language = langCode2To3(language)
    elif len(language) != 3:
        language = langTo3Code(language)
    url = "http://www.opensubtitles.org/en/search/"
    if language:
        url += "sublanguageid-%s/" % language
    url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
    data = read_url(url)
    if "title>opensubtitles.com - search results</title" in data:
        fd = feedparser.parse(data)
        opensubtitleId = None
        if fd.entries:
            link = fd.entries[0]['links'][0]['href']
            opensubtitleId = re.compile('subtitles/(.*?)/').findall(link)
            if opensubtitleId:
                opensubtitleId = opensubtitleId[0]
    else:
        opensubtitleId = find_re(data, '/en/subtitles/(.*?)/')
    return opensubtitleId
Ejemplo n.º 21
0
def get_data(id, language='en'):
    if language == 'de':
        url = 'http://films.arsenal-berlin.de/index.php/Detail/Object/Show/object_id/%d/lang/de_DE' % id
    else:
        url = 'http://films.arsenal-berlin.de/index.php/Detail/Object/Show/object_id/%d' % id
    html = read_url(url, unicode=True)
    if 'ID does not exist' in html:
        return None
    if 'Willkommen in der Datenbank des Arsenal' in html:
        return None
    data = {}
    data[u'id'] = id
    data[u'url'] = url
    m = re.compile('<h1>(.*?)</h1>').findall(html)
    if m:
        data[u'title'] = m[0]
    m = re.compile("<b>Director: </b><a href='.*?'>(.*?)</a>").findall(html)
    if m:
        data[u'director'] = m[0]

    m = re.compile("caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html)
    if m:
        data[u'image'] = m[0]

    units = re.compile("<div class='unit'>(.*?)</div>", re.DOTALL).findall(html)
    for x in map(re.compile('<b>(.*?)</b>: (.*)', re.DOTALL).findall, units):
        if x:
            #data[x[0][0].lower()] = strip_tags(x[0][1])
            key = x[0][0].lower()
            data[key] = x[0][1]
            if key == "forum catalogue pdf":
                data[key] = find_re(data[key], '"(http:.*?)"')
            else:
                data[key] = strip_tags(data[key])
    if "running time (minutes)" in data:
        data[u'runtime'] = float(data.pop("running time (minutes)").replace(',', '.')) * 60
    for key in ('year', 'length in metres', 'forum participation year', 'number of reels'):
        if key in data and data[key].isdigit():
            data[key] = int(data[key])
    return data
Ejemplo n.º 22
0
def get_data(id):
    '''
    >>> get_data('129689')['cast'][1][1]
    u'Marianne'
    >>> get_data('129689')['credits'][0][0]
    u'Jean-Luc Godard'
    >>> get_data('129689')['posters'][0]
    u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg'
    >>> get_data('129689')['rating']
    u'4.5'
    '''
    if id.startswith('http'):
        id = get_id(id)
    data = {
        "url": get_url(id)
    }
    html = read_url(data["url"], unicode=True)
    data['aka'] = parse_list(html, 'AKA')
    data['category'] = find_re(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
    data['countries'] = parse_list(html, 'countries')
    data['director'] = parse_entry(html, 'directed by')
    data['genres'] = parse_list(html, 'genres')
    data['keywords'] = parse_list(html, 'keywords')
    data['posters'] = [find_re(html, '<img src="(http://cps-.*?)"')]
    data['produced'] = parse_list(html, 'produced by')
    data['rating'] = find_re(html, 'Stars" title="(.*?) Stars"')
    data['released'] = parse_entry(html, 'released by')
    data['releasedate'] = parse_list(html, 'release date')
    data['runtime'] = parse_entry(html, 'run time').replace('min.', '').strip()
    data['set'] = parse_entry(html, 'set in')
    data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
    data['themes'] = parse_list(html, 'themes')
    data['types'] = parse_list(html, 'types')
    data['year'] = find_re(html, '<span class="year">.*?(\d+)')
    #data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
    data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
    #html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
    #data['cast'] = parse_table(html)
    #html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
    #data['credits'] = parse_table(html)
    html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
    data['review'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
    return data
Ejemplo n.º 23
0
def parse_list(html, title):
    html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
    r = map(strip_tags, re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
    if not r and html:
        r = [strip_tags(html)]
    return r
Ejemplo n.º 24
0
def get_movie_id(title, director='', year='', timeout=-1):
    '''
    >>> get_movie_id('The Matrix')
    u'0133093'

    >>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard')
    u'0060304'

    >>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967')
    u'0060304'

    >>> get_movie_id(u"Histoire(s) du cinema: Le controle de l'univers", 'Jean-Luc Godard')
    u'0179214'

    >>> get_movie_id(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard')
    u'0179214'
    '''
    imdbId = {
        (u'Le jour se l\xe8ve', u'Marcel Carn\xe9'): '0031514',
        (u'Wings', u'Larisa Shepitko'): '0061196',
        (u'The Ascent', u'Larisa Shepitko'): '0075404',
        (u'Fanny and Alexander', u'Ingmar Bergman'): '0083922',
        (u'Torment', u'Alf Sj\xf6berg'): '0036914',
        (u'Crisis', u'Ingmar Bergman'): '0038675',
        (u'To Joy', u'Ingmar Bergman'): '0043048',
        (u'Humain, trop humain', u'Louis Malle'): '0071635',
        (u'Place de la R\xe9publique', u'Louis Malle'): '0071999',
        (u'God\u2019s Country', u'Louis Malle'): '0091125',
        (u'Flunky, Work Hard', u'Mikio Naruse'): '0022036',
        (u'The Courtesans of Bombay', u'Richard Robbins') : '0163591',
        (u'Je tu il elle', u'Chantal Akerman') : '0071690',
        (u'Hotel Monterey', u'Chantal Akerman') : '0068725',
        (u'No Blood Relation', u'Mikio Naruse') : '023261',
        (u'Apart from You', u'Mikio Naruse') : '0024214',
        (u'Every-Night Dreams', u'Mikio Naruse') : '0024793',
        (u'Street Without End', u'Mikio Naruse') : '0025338',
        (u'Sisters of the Gion', u'Kenji Mizoguchi') : '0027672',
        (u'Osaka Elegy', u'Kenji Mizoguchi') : '0028021',
        (u'Blaise Pascal', u'Roberto Rossellini') : '0066839',
        (u'Japanese Girls at the Harbor', u'Hiroshi Shimizu') : '0160535',
        (u'The Private Life of Don Juan', u'Alexander Korda') : '0025681',
        (u'Last Holiday', u'Henry Cass') : '0042665',
        (u'A Colt Is My Passport', u'Takashi  Nomura') : '0330536',
        (u'Androcles and the Lion', u'Chester Erskine') : '0044355',
        (u'Major Barbara', u'Gabriel Pascal') : '0033868',
        (u'Come On Children', u'Allan King') : '0269104',

        (u'Jimi Plays Monterey & Shake! Otis at Monterey', u'D. A. Pennebaker and Chris Hegedus') : '',
        (u'Martha Graham: Dance on Film', u'Nathan Kroll') : '',
        (u'Carmen', u'Carlos Saura'): '0085297',
        (u'The Story of a Cheat', u'Sacha Guitry'): '0028201',
        (u'Weekend', 'Andrew Haigh'): '1714210',
    }.get((title, director), None)
    if imdbId:
        return imdbId
    params = {'s':'tt','q': title}
    if director:
        params['q'] = u'"%s" %s' % (title, director)
    if year:
        params['q'] = u'"%s (%s)" %s' % (title, year, director)
    google_query = "site:imdb.com %s" % params['q']
    if isinstance(params['q'], unicode):
        try:
            params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
        except:
            params['q'] = params['q'].encode('utf-8')
    params = urllib.urlencode(params)
    url = "http://akas.imdb.com/find?" + params
    #print url

    data = read_url(url, timeout=timeout, unicode=True)
    #if search results in redirect, get id of current page
    r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />'
    results = re.compile(r).findall(data)    
    if results:
        return results[0]
    #otherwise get first result
    r = '<td valign="top">.*?<a href="/title/tt(\d{7})/"'
    results = re.compile(r).findall(data)
    if results:
        return results[0]

    #print (title, director), ": '',"
    #print google_query
    #results = google.find(google_query, timeout=timeout)
    results = duckduckgo.find(google_query, timeout=timeout)
    if results:
        for r in results[:2]:
            imdbId = find_re(r[1], 'title/tt(\d{7})')
            if imdbId:
                return imdbId
    #or nothing
    return ''
Ejemplo n.º 25
0
def parse_text(html, title):
    return strip_tags(find_re(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
Ejemplo n.º 26
0
def amazon_lookup(asin):
    url = 'http://www.amazon.com/dp/%s' % asin
    html = read_url(url, timeout=-1).decode('utf-8', 'ignore')
    return list(set(find_isbns(find_re(html, 'Formats</h3>.*?</table'))))
Ejemplo n.º 27
0
def get_data(id):
    base = 'http://www.istockphoto.com'
    url = base + '/stock-photo-%s.php' % id
    id = find_re(id, '\d+')
    data = ox.cache.read_url(url, timeout=-1)
    info = {}
    info['title'] = ox.find_re(data, '<title>(.*?) \|')
    info['thumbnail'] = base + ox.find_re(data, 'src="(/file_thumbview_approve/%s.*?)"'%id)
    info['views'] = ox.find_re(data, '<tr><td>Views:</td><td>(\d+)</td>')
    info['collections'] = strip_tags(ox.find_re(data, '<td>Collections:</td><td>(.*?)</td>')).split(', ')
    info['collections'] = filter(lambda x: x.strip(), info['collections'])
    info['keywords'] = map(lambda k: k.strip(), strip_tags(ox.find_re(data, '<td>Keywords:</td>.*?<td>(.*?)\.\.\.<')).split(', '))
    info['keywords'] = ox.find_re(data, '<meta name="keywords" content="(.*?), stock image').split(', ')
    info['keywords'].sort()
    info['uploaded'] = ox.find_re(data, '<td>Uploaded on:</td>.*?<td>([\d\-]+)')
    info['downloads'] = ox.find_re(data, '<span class="fl">.*?(\d+)&nbsp;</span>')
    info['contributor'] = ox.find_re(data, '<td class="m">Contributor:</td>.*?<a href="user_view.php\?id=.*?">.*?alt="(.*?)"')
    info['description'] = strip_tags(ox.find_re(data, 'artistsDescriptionData = \["(.*?)<br'))
    info['description'] = info['description'].split('CLICK TO SEE')[0].strip()
    info['similar'] = re.compile('size=1\&id=(\d+)').findall(data)
    return info
Ejemplo n.º 28
0
def amazon_lookup(asin):
    url = 'http://www.amazon.com/dp/%s' % asin
    html = read_url(url, timeout=-1).decode('utf-8', 'ignore')
    return list(set(find_isbns(find_re(html, 'Formats</h3>.*?</table'))))
Ejemplo n.º 29
0
 def find_data(key):
     return find_re(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()
Ejemplo n.º 30
0
def info(key, value):
    if key not in ('isbn',):
        raise IOError('unknwon key %s' % key)
    if len(value) == 13:
        value = stdnum.isbn.to_isbn10(value)
    if len(value) != 10:
        raise IOError('invalid isbn %s' % value)
    url = 'http://www.amazon.com/dp/' + value
    data = read_url(url).decode()
    doc = lxml.html.document_fromstring(data)
    info = {}
    if '<title>404 - Document Not Found</title>' in data:
        return info
    if 'To discuss automated access to Amazon data please' in data:
        return info
    for l in doc.xpath('//link[@rel="canonical" and @href]'):
        info['asin'] = [l.get('href').rpartition('/')[-1]]
        break
    info['title'] = strip_tags(decode_html(doc.xpath('//span[@id="productTitle"]')[0].text))
    info['title'] = re.sub(' \([^\)]+? Classics\)', '', info['title'])
    info['title'] = re.sub(' \([^\)]+? Collection\)', '', info['title'])
    info['description'] = strip_tags(decode_html(unquote(re.compile('encodedDescription\' : "(.*?)",').findall(data)[0])))
    info['description'] = fix_bad_unicode(info['description'])
    content = doc.xpath('//div[@class="content"]')[0]
    content_info = {}
    for li in content.xpath('.//li'):
        v = li.text_content()
        if ': ' in v:
            k, v = li.text_content().split(': ', 1)
            content_info[k.strip()] = v.strip()
    if 'Language' in content_info:
        info['language'] = content_info['Language']
    if 'Publisher' in content_info:
        if ' (' in content_info['Publisher']:
            info['date'] = find_re(content_info['Publisher'].split(' (')[-1], '\d{4}')
        info['publisher'] = content_info['Publisher'].split(' (')[0]
        if '; ' in info['publisher']:
            info['publisher'], info['edition'] = info['publisher'].split('; ', 1)

    if 'ISBN-13' in content_info:
        if not 'isbn' in info: info['isbn'] = []
        info['isbn'].append(content_info['ISBN-13'].replace('-', ''))
    if 'ISBN-10' in content_info:
        if not 'isbn' in info: info['isbn'] = []
        info['isbn'].append(content_info['ISBN-10'])

    a = doc.xpath('//span[@class="a-size-medium"]')
    if a:
        for span in a:
            r = span.getchildren()[0].text.strip()
            role = get_role(r)
            if not role in info: info[role] = []
            info[role].append(span.text.strip())
    else:
        for span in doc.xpath('//span[@class="author notFaded"]'):
            author = [x.strip() for x in span.text_content().strip().split('\n') if x.strip()]
            role = get_role(author[-1])
            if not role in info: info[role] = []
            info[role].append(author[0])

    covers = re.compile('data-a-dynamic-image="({.+?})"').findall(data)[0]
    covers = json.loads(decode_html(covers))
    last = [0,0]
    for url in covers:
        if covers[url] > last:
            last = covers[url]
            info['cover'] = re.sub('(\._SX.+?_\.)', '.', url)
    return info
Ejemplo n.º 31
0
def info(key, value):
    if key not in ('isbn', ):
        raise IOError('unknwon key %s' % key)
    if len(value) == 13:
        value = stdnum.isbn.to_isbn10(value)
    if len(value) != 10:
        raise IOError('invalid isbn %s' % value)
    url = 'http://www.amazon.com/dp/' + value
    data = read_url(url).decode()
    doc = lxml.html.document_fromstring(data)
    info = {}
    if '<title>404 - Document Not Found</title>' in data:
        return info
    if 'To discuss automated access to Amazon data please' in data:
        return info
    for l in doc.xpath('//link[@rel="canonical" and @href]'):
        info['asin'] = [l.get('href').rpartition('/')[-1]]
        break
    info['title'] = strip_tags(
        decode_html(doc.xpath('//span[@id="productTitle"]')[0].text))
    info['title'] = re.sub(' \([^\)]+? Classics\)', '', info['title'])
    info['title'] = re.sub(' \([^\)]+? Collection\)', '', info['title'])
    info['description'] = strip_tags(
        decode_html(
            unquote(
                re.compile('encodedDescription\' : "(.*?)",').findall(data)
                [0])))
    info['description'] = fix_bad_unicode(info['description'])
    content = doc.xpath('//div[@class="content"]')[0]
    content_info = {}
    for li in content.xpath('.//li'):
        v = li.text_content()
        if ': ' in v:
            k, v = li.text_content().split(': ', 1)
            content_info[k.strip()] = v.strip()
    if 'Language' in content_info:
        info['language'] = content_info['Language']
    if 'Publisher' in content_info:
        if ' (' in content_info['Publisher']:
            info['date'] = find_re(content_info['Publisher'].split(' (')[-1],
                                   '\d{4}')
        info['publisher'] = content_info['Publisher'].split(' (')[0]
        if '; ' in info['publisher']:
            info['publisher'], info['edition'] = info['publisher'].split(
                '; ', 1)

    if 'ISBN-13' in content_info:
        if not 'isbn' in info: info['isbn'] = []
        info['isbn'].append(content_info['ISBN-13'].replace('-', ''))
    if 'ISBN-10' in content_info:
        if not 'isbn' in info: info['isbn'] = []
        info['isbn'].append(content_info['ISBN-10'])

    a = doc.xpath('//span[@class="a-size-medium"]')
    if a:
        for span in a:
            r = span.getchildren()[0].text.strip()
            role = get_role(r)
            if not role in info: info[role] = []
            info[role].append(span.text.strip())
    else:
        for span in doc.xpath('//span[@class="author notFaded"]'):
            author = [
                x.strip() for x in span.text_content().strip().split('\n')
                if x.strip()
            ]
            role = get_role(author[-1])
            if not role in info: info[role] = []
            info[role].append(author[0])

    covers = re.compile('data-a-dynamic-image="({.+?})"').findall(data)[0]
    covers = json.loads(decode_html(covers))
    last = [0, 0]
    for url in covers:
        if covers[url] > last:
            last = covers[url]
            info['cover'] = re.sub('(\._SX.+?_\.)', '.', url)
    return info
Ejemplo n.º 32
0
    def __init__(self, id, timeout=-1):
        #use akas.imdb.com to always get original title:
        #http://www.imdb.com/help/show_leaf?titlelanguagedisplay
        self.baseUrl = "http://akas.imdb.com/title/tt%s/" % id
        super(Imdb, self).__init__(timeout)
       
        url = self.baseUrl + 'combined' 
        page = self.read_url(url, timeout=-1)
        if '<title>IMDb: Page not found</title>' in page \
            or 'The requested URL was not found on our server.' in page:
            return
        if "<p>We're sorry, something went wrong.</p>" in page:
            time.sleep(1)
            super(Imdb, self).__init__(0)

        if 'alternativeTitles' in self:
            if len(self['alternativeTitles']) == 2 and \
               isinstance(self['alternativeTitles'][0], basestring):
               self['alternativeTitles'] = [self['alternativeTitles']]

        #normalize country names
        if 'country' in self:
            self['country'] = [normalize_country_name(c) or c for c in self['country']]

        if 'sound' in self:
            self['sound'] = list(set(self['sound']))

        types = {}
        stop_words = [ 
            'alternative spelling',
            'alternative title',
            'alternative transliteration',
            'closing credits title',
            'complete title',
            'IMAX version',
            'informal short title',
            'International (Spanish title)',
            'Japan (imdb display title)',
            'longer version',
            'new title',
            'original subtitled version',
            'pre-release title',
            'promotional abbreviation',
            'recut version',
            'reissue title',
            'restored version',
            'script title',
            'short title',
            '(subtitle)',
            'TV title',
            'working title',
            'World-wide (Spanish title)',
        ]
        #ignore english japanese titles
        #for movies that are not only from japan
        if ['Japan'] != self.get('country', []):
            stop_words += [
                'Japan (English title)'
            ]
        for t in self.get('alternativeTitles', []):
            for type in t[0].split('/'):
                type = type.strip()
                stop_word = False
                for key in stop_words:
                    if key in type:
                        stop_word = True
                        break
                if not stop_word:
                    if not type in types:
                        types[type] = []
                    types[type].append(t[1])
        titles = {}
        for type in types:
            for title in types[type]:
                if not title in titles:
                    titles[title] = []
                titles[title].append(type)
        def select_title(type):
            title = types[type][0]
            count = 0
            if len(types[type]) > 1:
                for t in types[type]:
                    if len(titles[t]) > count:
                        count = len(titles[t])
                        title = t
            return title

        #FIXME: does work in python2.6, possible to import from __future__?
        #types = {type: select_title(type) for type in types}
        _types = {}
        for type in types:
            _types[type] = select_title(type)
        types = _types

        regexps = [
            "^.+ \(imdb display title\) \(English title\)$",
            "^USA \(imdb display title\)$",
            "^International \(English title\)$",
            "^International \(English title\)$",
            "^UK \(imdb display title\)$",
            "^International \(.+\) \(English title\)$",
            "^World-wide \(English title\)$",
        ]
        if 'Hong Kong' in self.get('country', []):
            regexps += [
                "Hong Kong \(English title\)"
            ]
        english_countries = (
            'USA', 'UK', 'United States', 'United Kingdom',
            'Australia', 'New Zealand'
        )
        if not filter(lambda c: c in english_countries, self.get('country', [])):
            regexps += [
                "^[^(]+ \(English title\)$",
                "^.+ \(.+\) \(English title\)$",
                "^USA$",
                "^UK$",
                "^USA \(.+\)$",
                "^UK \(.+\)$",
                "^Australia \(.+\)$",
                "World-wide \(English title\)",
                "\(literal English title\)",
                "^International \(.+ title\)$",
                "^International \(.+\) \(.+ title\)$",
            ]
        for regexp in regexps:
            for type in types:
                if re.compile(regexp).findall(type):
                    #print types[type], type
                    self['internationalTitle'] = types[type]
                    break
            if 'internationalTitle' in self:
                break

        def cleanup_title(title):
            if title.startswith('"') and title.endswith('"'):
                title = title[1:-1]
            if title.startswith("'") and title.endswith("'"):
                title = title[1:-1]
            title = re.sub('\(\#[.\d]+\)', '', title)
            return title.strip()

        for t in ('title', 'internationalTitle'):
            if t in self:
                self[t] = cleanup_title(self[t])

        if 'internationalTitle' in self and \
            self.get('title', '').lower() == self['internationalTitle'].lower():
            del self['internationalTitle']

        if 'alternativeTitles' in self:
            alt = {}
            for t in self['alternativeTitles']:
                title = cleanup_title(t[1])
                if title not in (self.get('title'), self.get('internationalTitle')):
                    if title not in alt:
                        alt[title] = []
                    for c in t[0].split('/'):
                        if not '(working title)' in c:
                            c = c.replace('International', '').replace('World-wide', '').split('(')[0].strip()
                            if c:
                                alt[title].append(c)
            self['alternativeTitles'] = []
            for t in sorted(alt, lambda a, b: cmp(sorted(alt[a]), sorted(alt[b]))):
                if alt[t]:
                    countries = sorted([normalize_country_name(c) or c for c in alt[t]])
                    self['alternativeTitles'].append((t, countries))
            if not self['alternativeTitles']:
                del self['alternativeTitles']

        if 'internationalTitle' in self:
            self['originalTitle'] = self['title']
            self['title'] = self.pop('internationalTitle')

        if 'runtime' in self and self['runtime']:
            if 'min' in self['runtime']: base=60
            else: base=1
            self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
        if 'runtime' in self and not self['runtime']:
            del self['runtime']
        if 'votes' in self: self['votes'] = self['votes'].replace(',', '')

        if 'cast' in self:
            if isinstance(self['cast'][0], basestring):
                self['cast'] = [self['cast']]
            self['actor'] = [c[0] for c in self['cast']]
            def cleanup_character(c):
                c = c.replace('(uncredited)', '').strip()
                return c
            self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
                            for x in self['cast']]

        if 'connections' in self:
            cc={}
            if len(self['connections']) == 3 and isinstance(self['connections'][0], basestring):
                self['connections'] = [self['connections']]
            for rel, data, _ in self['connections']:
                #cc[unicode(rel)] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data)
                def get_conn(c):
                    r = {
                        'id': c[0],
                        'title': cleanup_title(c[1]),
                    }
                    description = c[2].split('<br />')
                    if len(description) == 2 and description[-1].strip() != '-':
                        r['description'] = description[-1].strip()
                    return r
                cc[unicode(rel)] = map(get_conn, re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data))


            self['connections'] = cc

        for key in ('country', 'genre'):
            if key in self:
                self[key] = filter(lambda x: x.lower() != 'home', self[key])
        #0092999
        if '_director' in self:
            if 'series' in self or 'isSeries' in self:
                self['creator'] = self.pop('_director')
            else:
                del self['_director']
        if 'isSeries' in self:
            del self['isSeries']
            self['isSeries'] = True
        if 'episodeTitle' in self:
            self['episodeTitle'] = re.sub('Episode \#\d+\.\d+', '', self['episodeTitle'])

        if 'series' in self:
            series = Imdb(self['series'], timeout=timeout)
            self['seriesTitle'] = series['title']
            if 'episodeTitle' in self:
                self['seriesTitle'] = series['title']
                if 'season' in self and 'episode' in self:
                    self['title'] = "%s (S%02dE%02d) %s" % (
                        self['seriesTitle'], self['season'], self['episode'], self['episodeTitle'])
                else:
                    self['title'] = "%s (S01) %s" % (self['seriesTitle'], self['episodeTitle'])
                    self['season'] = 1
                self['title'] = self['title'].strip()
            if 'director' in self:
                self['episodeDirector'] = self['director']

            if not 'creator' in series and 'director' in series:
                series['creator'] = series['director']
                if len(series['creator']) > 10:
                    series['creator'] = series['director'][:1]

            for key in ['creator', 'country']:
                if key in series:
                    self[key] = series[key]

            if 'year' in series:
                self['seriesYear'] = series['year']
                if not 'year' in self:
                    self['year'] = series['year']

            if 'year' in self:
                self['episodeYear'] = self['year']
            if 'creator' in self:
                self['seriesDirector'] = self['creator']
            if 'originalTitle' in self:
                del self['originalTitle']
        else:
            for key in ('seriesTitle', 'episodeTitle', 'season', 'episode'):
                if key in self:
                    del self[key]
        if 'creator' in self:
            if 'director' in self:
                self['episodeDirector'] = self['director']
            self['director'] = self['creator']

        #make lists unique but keep order
        for key in ('director', 'language'):
            if key in self:
                self[key] = [x for i,x in enumerate(self[key])
                             if x not in self[key][i+1:]]

        for key in ('actor', 'writer', 'producer', 'editor', 'composer'):
            if key in self:
                if isinstance(self[key][0], list):
                    self[key] = [i[0] for i in self[key] if i]
                self[key] = sorted(list(set(self[key])),
                                   lambda a, b: self[key].index(a) - self[key].index(b))

        if 'budget' in self and 'gross' in self:
            self['profit'] = self['gross'] - self['budget']

        if 'releasedate' in self:
            def parse_date(d):
                try:
                    d = datetime.strptime(d, '%d %B %Y')
                except:
                    try:
                        d = datetime.strptime(d, '%B %Y')
                    except:
                        return 'x'
                return '%d-%02d-%02d' % (d.year, d.month, d.day)
            self['releasedate'] = min([
                parse_date(d) for d in self['releasedate']
            ])
            if self['releasedate'] == 'x':
                del self['releasedate']
        if 'summary' in self:
            if isinstance(self['summary'], list):
                self['summary'] = self['summary'][0]
            self['summary'] = self['summary'].split('</p')[0].strip()
Ejemplo n.º 33
0
def parse_entry(html, title):
    html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
    return strip_tags(html).strip()