Ejemplo n.º 1
0
Archivo: meta.py Proyecto: vasc/couchee
def imdb_direct_info(imdb_id):
    """Fetch the IMDb title page for *imdb_id* and print the scraped fields.

    Downloads ``http://www.imdb.com/title/<imdb_id>/``, converts the HTML
    with ``html2xml.translate`` and ``xmlquery.parse_xml``, then prints, in
    order: the id, name, year, genres, rating, vote count, short plot and a
    dict mapping each ``div.txt-block`` heading to a list of
    ``(link text, href)`` tuples for its ``/name/`` links.

    The function always returns None.  It stops early, after printing the
    error, when the request raises ``urllib2.HTTPError`` or the page cannot
    be decoded as UTF-8, and stops silently when the ``#overview-top``
    element is not found.
    """
    print(imdb_id)

    try:
        html = urllib2.urlopen('http://www.imdb.com/title/' + imdb_id + '/').read()
    except urllib2.HTTPError as e:
        print(e)
        return

    # Rewrite hexadecimal character references (&#xNN;) as percent-escapes
    # so that the unquote() call below decodes them.
    html = re.sub(r'&#x(\w+);', r'%\1', html)

    try:
        html = unicode(html, 'utf-8')
    except UnicodeDecodeError as e:
        # Report the id being processed; the previous code referenced an
        # unbound local here and raised instead of printing the message.
        print("UnicodeDecodeError (%s) in %s" % (e, imdb_id))
        return

    html = urllib2.unquote(html)
    page = xmlquery.parse_xml(html2xml.translate(html))

    info = page.queryone('#overview-top')
    if not info:
        return

    name = str(info.queryone('h1.header').children[0])
    print(name)

    # The year stays as whatever falsy value queryone() returned when the
    # header carries no year link.
    year = info.queryone('h1.header>span>a')
    if year:
        year = int(year.text)
    print(year)

    genres = [link.text for link in info.query('.infobar>a[href^="/genre/"]')]
    print(genres)

    # The page shows '-' in place of a number; that is reported as 0.
    rating = str(info.queryone('span[class="rating-rating"]').children[0])
    rating = 0 if rating == '-' else float(rating)
    print(rating)

    # First whitespace-separated token of the ratings link, with thousands
    # separators removed.
    votes_text = info.queryone('a[href="ratings"]').text.split()[0]
    votes = int(votes_text.replace(',', ''))
    print(votes)

    # The short plot is the first line of the second <p>, when there is one.
    paragraphs = info.query('p')
    if len(paragraphs) > 1:
        short_plot = paragraphs[1].text.splitlines()[0]
    else:
        short_plot = None
    print(short_plot)

    people = {}
    for block in info.query('div.txt-block'):
        # Drop the heading's last character (presumably a trailing ':' --
        # confirm against the page markup).
        heading = block.queryone('h4.inline').text[:-1]
        people[heading] = [
            (link.text, link.attributes['href'])
            for link in block.query('a[href^="/name/"]')
        ]
    print(people)
Ejemplo n.º 2
0
import xmlquery

# Smoke test for xmlquery: parse a two-element document and print the
# name of every node returned by the "*" query.
xml = xmlquery.parse_xml("<a><b /></a>")
c = xml.query("*")
# Disabled experiment (renaming a matched node): c[1].name = "banana"

for node in c:
    print(node.name)