def get_data(piratebayId):
    _key_map = {
        'spoken language(s)': u'language',
        'texted language(s)': u'subtitle language',
        'by': u'uploader',
        'leechers': 'leecher',
        'seeders': 'seeder',
    }
    piratebayId = get_id(piratebayId)
    torrent = dict()
    torrent[u'id'] = piratebayId
    torrent[u'domain'] = 'thepiratebay.org'
    torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId

    data = read_url(torrent['comment_link'], unicode=True)
    torrent[u'title'] = find_re(data, '<title>(.*?) \(download torrent\) - TPB</title>')
    if not torrent[u'title']:
        return None
    torrent[u'title'] = decode_html(torrent[u'title']).strip()
    torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
    title = quote(torrent['title'].encode('utf-8'))
    torrent[u'torrent_link'] = "http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
    for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
        key = d[0].lower().strip()
        key = _key_map.get(key, key)
        value = decode_html(strip_tags(d[1].strip()))
        torrent[key] = value
    torrent[u'description'] = find_re(data, '<div class="nfo">(.*?)</div>')
    if torrent[u'description']:
        torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
    t = read_url(torrent[u'torrent_link'])
    torrent[u'torrent_info'] = get_torrent_info(t)
    return torrent
def get_data(url):
    data = read_url(url)
    r = {}
    r['title'] = find_re(data, '<h1 class="movie_title">(.*?)</h1>')
    if '(' in r['title']:
        r['year'] = find_re(r['title'], '\((\d*?)\)')
        r['title'] = strip_tags(re.sub('\((\d*?)\)', '', r['title'])).strip()
    r['summary'] = strip_tags(find_re(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
    # collapse tabs, newlines and runs of spaces into single spaces
    r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace('  ', ' ').replace('  ', ' ')
    if not r['summary']:
        r['summary'] = get_og(data, 'description')

    meter = re.compile('<span id="all-critics-meter" class="meter(.*?)">(.*?)</span>').findall(data)
    meter = filter(lambda m: m[1].isdigit(), meter)
    if meter:
        r['tomatometer'] = meter[0][1]
    r['rating'] = find_re(data, 'Average Rating: <span>([\d.]+)/10</span>')
    r['user_score'] = find_re(data, '<span class="meter popcorn numeric ">(\d+)</span>')
    r['user_rating'] = find_re(data, 'Average Rating: ([\d.]+)/5')
    poster = get_og(data, 'image')
    if poster and not 'poster_default.gif' in poster:
        r['posters'] = [poster]
    for key in r.keys():
        if not r[key]:
            del r[key]
    return r
def get_data(mininovaId):
    _key_map = {
        'by': u'uploader',
    }
    mininovaId = get_id(mininovaId)
    torrent = dict()
    torrent[u'id'] = mininovaId
    torrent[u'domain'] = 'mininova.org'
    torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
    torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
    torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId

    data = read_url(torrent['comment_link'], unicode=True) + read_url(torrent['details_link'], unicode=True)
    if '<h1>Torrent not found...</h1>' in data:
        return None

    for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
        key = d[0].lower().strip()
        key = _key_map.get(key, key)
        value = decode_html(strip_tags(d[1].strip()))
        torrent[key] = value

    torrent[u'title'] = find_re(data, '<title>(.*?):.*?</title>')
    torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
    torrent[u'description'] = find_re(data, '<div id="description">(.*?)</div>')
    if torrent['description']:
        torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
    t = read_url(torrent[u'torrent_link'])
    torrent[u'torrent_info'] = get_torrent_info(t)
    return torrent
def get_book(id):
    if isinstance(id, basestring) and id.startswith('http'):
        url = id
    else:
        url = get_url(id)
    html = ox.cache.read_url(url, unicode=True)
    data = {}
    data['url'] = url
    pages = []
    page = get_page(url)
    pages.append(page)
    data['base'], data['images'] = get_images(page, html, True)
    info = ox.find_re(html, '<table>.*?</table>')
    for i in re.compile('<tr.*?>(.*?)</tr>').findall(info):
        key, value = i.split('</td><td>')
        data[ox.strip_tags(key)] = ox.strip_tags(value)
    # follow the "next page" links until none are left; reassigning
    # `links` inside the loop is what advances the pagination
    links = re.compile('<a style="float: right;" href="(/buch/.*?)">').findall(html)
    while links:
        for l in links:
            l = 'http://gutenberg.spiegel.de' + l
            html = ox.cache.read_url(l)
            links = re.compile('<a style="float: right;" href="(/buch/.*?)">').findall(html)
            page = get_page(l)
            pages.append(page)
            data['images'] += get_images(page, html)
    data['pages'] = pages
    return data
def find(query=None, user=None, timeout=60):
    if user:
        url = "https://twitter.com/" + quote(user)
    else:
        url = "https://twitter.com/search/" + quote(query)
    data = ox.cache.read_url(url, timeout=timeout).decode("utf-8")
    doc = lxml.html.document_fromstring(data)
    tweets = []
    for e in doc.xpath("//div[contains(@class, 'original-tweet')]"):
        t = lxml.html.tostring(e)
        text = e.xpath(".//p[contains(@class, 'js-tweet-text')]")[0]
        html = lxml.html.tostring(text, encoding="unicode").strip()
        text = ox.decode_html(ox.strip_tags(html)).strip()
        user = re.compile('data-name="(.*?)"').findall(t)[0]
        user = ox.decode_html(ox.strip_tags(user)).strip()
        tweets.append({
            "id": re.compile('data-tweet-id="(\d+)"').findall(t)[0],
            "user-id": re.compile('data-user-id="(\d+)"').findall(t)[0],
            "name": re.compile('data-screen-name="(.*?)"').findall(t)[0],
            "time": datetime.fromtimestamp(int(re.compile('data-time="(\d+)"').findall(t)[0])),
            "user": user,
            "text": text,
            "html": html,
        })
    return tweets
def get_data(url):
    if not url.startswith('http:'):
        url = get_url(url)
    data = read_url(url, unicode=True)
    m = {
        'id': get_id(url),
        'url': url,
        'type': re.compile('ubu.com/(.*?)/').findall(url)[0]
    }
    for videourl, title in re.compile('<a href="(http://ubumexico.centro.org.mx/.*?)">(.*?)</a>').findall(data):
        if videourl.endswith('.srt'):
            m['srt'] = videourl
        elif not 'video' in m:
            m['video'] = videourl
            m['video'] = m['video'].replace('/video/ ', '/video/').replace(' ', '%20')
            if m['video'] == 'http://ubumexico.centro.org.mx/video/':
                del m['video']
            m['title'] = strip_tags(decode_html(title)).strip()
    if not 'url' in m:
        print url, 'missing'
    if 'title' in m:
        m['title'] = re.sub('(.*?) \(\d{4}\)$', '\\1', m['title'])

    match = re.compile("flashvars','file=(.*?.flv)'").findall(data)
    if match:
        m['flv'] = match[0]
        m['flv'] = m['flv'].replace('/video/ ', '/video/').replace(' ', '%20')

    y = re.compile('\((\d{4})\)').findall(data)
    if y:
        m['year'] = int(y[0])
    d = re.compile('Director: (.+)').findall(data)
    if d:
        m['director'] = strip_tags(decode_html(d[0])).strip()

    a = re.compile('<a href="(.*?)">Back to (.*?)</a>', re.DOTALL).findall(data)
    if a:
        m['artist'] = strip_tags(decode_html(a[0][1])).strip()
    else:
        a = re.compile('<a href="(.*?)">(.*?) in UbuWeb Film').findall(data)
        if a:
            m['artist'] = strip_tags(decode_html(a[0][1])).strip()
        else:
            a = re.compile('<b>(.*?)\(b\..*?\d{4}\)').findall(data)
            if a:
                m['artist'] = strip_tags(decode_html(a[0])).strip()
            elif m['id'] == 'film/lawder_color':
                m['artist'] = 'Standish Lawder'
    if 'artist' in m:
        m['artist'] = m['artist'].replace('in UbuWeb Film', '')
        m['artist'] = m['artist'].replace('on UbuWeb Film', '').strip()
    if m['id'] == 'film/coulibeuf':
        m['title'] = 'Balkan Baroque'
        m['year'] = 1999
    return m
def find(query, timeout=ox.cache.cache_timeout):
    if isinstance(query, unicode):
        query = query.encode('utf-8')
    params = urllib.urlencode({'q': query})
    url = 'http://duckduckgo.com/html/?' + params
    data = read_url(url, timeout=timeout).decode('utf-8')
    results = []
    regex = '<a .*?class="large" href="(.+?)">(.*?)</a>.*?<div class="snippet">(.*?)</div>'
    for r in re.compile(regex, re.DOTALL).findall(data):
        results.append((strip_tags(decode_html(r[1])), r[0], strip_tags(decode_html(r[2]))))
    return results
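# A minimal usage sketch for find() above. The query string is invented, and
# it assumes DuckDuckGo still serves the legacy /html/ layout the regex targets:
#
#     for title, url, snippet in find(u'ubuweb film')[:3]:
#         print title, url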
def get_data(id):
    url = "http://www.amazon.com/title/dp/%s/" % id
    data = read_url(url, unicode=True)

    def find_data(key):
        return find_re(data, '<li><b>%s:</b>(.*?)</li>' % key).strip()

    r = {}
    r['amazon'] = url
    r['title'] = find_re(data, '<span id="productTitle" class="a-size-large">(.*?)</span>')
    r['authors'] = []
    doc = lxml.html.document_fromstring(data)
    for e in doc.xpath("//span[contains(@class, 'author')]"):
        print e
        for secondary in e.xpath(".//span[contains(@class, 'a-color-secondary')]"):
            if 'Author' in secondary.text:
                author = e.xpath(".//span[contains(@class, 'a-size-medium')]")
                if author:
                    r['authors'].append(author[0].text.strip())
                else:
                    r['authors'].append(e.xpath('.//a')[0].text.strip())
                break
            elif 'Translator' in secondary.text:
                r['translator'] = [e.xpath('.//a')[0].text]
                break
    r['publisher'] = find_data('Publisher')
    r['language'] = find_data('Language')
    r['isbn-10'] = find_data('ISBN-10')
    r['isbn-13'] = find_data('ISBN-13').replace('-', '')
    r['dimensions'] = find_re(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')

    r['pages'] = find_data('Paperback')
    if not r['pages']:
        r['pages'] = find_data('Hardcover')

    r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()

    for e in doc.xpath('//noscript'):
        for c in e.getchildren():
            if c.tag == 'div':
                r['description'] = strip_tags(decode_html(lxml.html.tostring(c))).strip()
                break

    r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
    if r['cover']:
        r['cover'] = r['cover'][0].split('._BO2')[0]
        if not r['cover'].endswith('.jpg'):
            r['cover'] = r['cover'] + '.jpg'
        if 'no-image-avail-img' in r['cover']:
            del r['cover']
    else:
        del r['cover']
    return r
def get_reviews(url):
    data = read_url(url, unicode=True)
    doc = document_fromstring(data)
    score = doc.xpath('//span[@itemprop="ratingValue"]')
    if score:
        score = int(score[0].text)
    else:
        score = -1

    # NOTE: some reviews may not have authors
    # one solution is to track by source instead
    sources = [a.text for a in doc.xpath(
        '//div[contains(@class, "critic_reviews")]'
        '//div[@class="review_content"]'
        '//div[@class="source"]//a|//span[@class="no_link"]')]
    reviews = [d.text for d in doc.xpath(
        '//div[contains(@class, "critic_reviews")]'
        '//div[@class="review_content"]'
        '//div[@class="review_body"]')]
    scores = [score_to_int(d.text.strip()) for d in doc.xpath(
        '//div[contains(@class, "critic_reviews")]'
        '//div[@class="review_content"]'
        '//div[contains(@class, "metascore_w")]')]

    metacritics = []
    for i in range(len(reviews)):
        if scores[i] != -1:  # Don't include TBD scores
            metacritics.append({
                'source': sources[i],
                'quote': strip_tags(reviews[i]).strip(),
                'score': scores[i],
            })
    return {
        'critics': metacritics,
        'id': get_id(url),
        'score': score,
        'url': url,
    }
def get_show_data(url):
    data = read_url(url, unicode=True)
    r = {}
    r['title'] = strip_tags(find_re(data, '<h1>(.*?)</h1>'))
    r['imdb'] = find_re(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
    r['episodes'] = {}
    #1.  1- 1  1001  7 Aug 05  You Can't Miss the Bear
    for episode in re.compile('(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) <a target="_blank" href="(.*?)">(.*?)</a>').findall(data):
        air_date = episode[3].strip()
        #'22 Sep 04' -> 2004-09-22
        try:
            air_date = time.strftime('%Y-%m-%d', time.strptime(air_date, '%d %b %y'))
        except:
            pass
        s = episode[1].split('-')[0].strip()
        e = episode[1].split('-')[-1].strip()
        try:
            r['episodes']['S%02dE%02d' % (int(s), int(e))] = {
                'prod code': episode[2],
                'air date': air_date,
                'url': episode[4],
                'title': episode[5],
            }
        except:
            print "oxweb.epguides failed,", url
    return r
def get_data(isbn):
    r = {}
    url = '%s/Search/Book/%s/1' % (base, isbn)
    data = read_url(url).decode('utf-8')
    m = re.compile('href="(/Lookup/Book/[^"]+?)"').findall(data)
    if m:
        ids = m[0].split('/')
        r['isbn'] = ids[-2]
        r['asin'] = ids[-3]
        url = '%s%s' % (base, m[0])
        data = read_url(url).decode('utf-8')
        r["title"] = find_re(data, "<h2>(.*?)</h2>")
        keys = {
            'author': 'Author(s)',
            'publisher': 'Publisher',
            'date': 'Publication date',
            'edition': 'Edition',
            'binding': 'Binding',
            'volume': 'Volume(s)',
            'pages': 'Pages',
        }
        for key in keys:
            r[key] = find_re(data, '<span class="title">%s:</span>(.*?)</li>' % re.escape(keys[key]))
            if r[key] == '--':
                r[key] = ''
            if key == 'pages' and r[key]:
                r[key] = int(r[key])
        desc = find_re(data, '<h2>Description:<\/h2>(.*?)<div ')
        desc = desc.replace('<br /><br />', ' ').replace('<br /> ', ' ').replace('<br />', ' ')
        r['description'] = strip_tags(desc).strip()
        if r['description'] == u'Description of this item is not available at this time.':
            r['description'] = ''
        r['cover'] = find_re(data, '<img src="(.*?)" alt="Book cover').replace('._SL160_', '')
    return r
def parse_table(html):
    # split the results table into rows, then each row into cells,
    # stripping tags and non-breaking spaces from the cell text
    return [
        [
            strip_tags(r).strip().replace('&nbsp;', '')
            for r in x.split('<td width="305">-')
        ]
        for x in find_re(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
    ]
def lookup(id):
    logger.debug('lookup %s', id)
    r = {
        'asin': [id]
    }
    url = '%s/Lookup/Book/%s/%s/1' % (base, id, id)
    logger.debug('%s', url)
    data = read_url(url).decode('utf-8')
    r["title"] = find_re(data, "<h2>(.*?)</h2>")
    if r["title"] == 'Error!':
        return {}
    keys = {
        'author': 'Author(s)',
        'publisher': 'Publisher',
        'date': 'Publication date',
        'edition': 'Edition',
        'binding': 'Binding',
        'volume': 'Volume(s)',
        'pages': 'Pages',
    }
    for key in keys:
        r[key] = find_re(data, '<span class="title">%s:</span>(.*?)</li>' % re.escape(keys[key]))
        if r[key] == '--' or not r[key]:
            del r[key]
        if key == 'pages' and key in r:
            r[key] = int(r[key])
    desc = find_re(data, '<h2>Description:<\/h2>(.*?)<div ')
    desc = desc.replace('<br /><br />', ' ').replace('<br /> ', ' ').replace('<br />', ' ')
    r['description'] = decode_html(strip_tags(desc))
    r['cover'] = find_re(data, '<img src="(.*?)" alt="Book cover').replace('._SL160_', '')
    for key in r:
        if isinstance(r[key], str):
            r[key] = decode_html(strip_tags(r[key])).strip()
    if 'author' in r and isinstance(r['author'], str) and r['author']:
        r['author'] = [r['author']]
    else:
        r['author'] = []
    if not r['author'] or r['author'][0].isupper():
        del r['author']
    if r['description'].lower() == 'Description of this item is not available at this time.'.lower():
        r['description'] = ''
    return r
def download_subtitle(opensubtitle_id):
    srts = {}
    data = read_url('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
    reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
    for f in re.compile(reg_exp, re.DOTALL).findall(data):
        name = strip_tags(f[1]).split('\n')[0]
        # the download links are relative, so join them with the same
        # opensubtitles.org host the page itself was fetched from
        url = "http://www.opensubtitles.org%s" % f[0]
        srts[name] = read_url(url, unicode=True)
    return srts
def get_data(id):
    '''
    >>> get_data('129689')['cast'][1][1]
    u'Marianne'
    >>> get_data('129689')['credits'][0][0]
    u'Jean-Luc Godard'
    >>> get_data('129689')['posters'][0]
    u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg'
    >>> get_data('129689')['rating']
    u'4.5'
    '''
    if id.startswith('http'):
        id = get_id(id)
    data = {
        "url": get_url(id)
    }
    html = read_url(data["url"], unicode=True)
    data['aka'] = parse_list(html, 'AKA')
    data['category'] = find_re(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
    data['countries'] = parse_list(html, 'countries')
    data['director'] = parse_entry(html, 'directed by')
    data['genres'] = parse_list(html, 'genres')
    data['keywords'] = parse_list(html, 'keywords')
    data['posters'] = [find_re(html, '<img src="(http://cps-.*?)"')]
    data['produced'] = parse_list(html, 'produced by')
    data['rating'] = find_re(html, 'Stars" title="(.*?) Stars"')
    data['released'] = parse_entry(html, 'released by')
    data['releasedate'] = parse_list(html, 'release date')
    data['runtime'] = parse_entry(html, 'run time').replace('min.', '').strip()
    data['set'] = parse_entry(html, 'set in')
    data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
    data['themes'] = parse_list(html, 'themes')
    data['types'] = parse_list(html, 'types')
    data['year'] = find_re(html, '<span class="year">.*?(\d+)')
    #data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
    data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
    #html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
    #data['cast'] = parse_table(html)
    #html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
    #data['credits'] = parse_table(html)
    html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
    data['review'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
    return data
def get_data(id):
    base = 'http://www.istockphoto.com'
    url = base + '/stock-photo-%s.php' % id
    id = find_re(id, '\d+')
    data = ox.cache.read_url(url, timeout=-1)
    info = {}
    info['title'] = ox.find_re(data, '<title>(.*?) \|')
    info['thumbnail'] = base + ox.find_re(data, 'src="(/file_thumbview_approve/%s.*?)"' % id)
    info['views'] = ox.find_re(data, '<tr><td>Views:</td><td>(\d+)</td>')
    info['collections'] = strip_tags(ox.find_re(data, '<td>Collections:</td><td>(.*?)</td>')).split(', ')
    info['collections'] = filter(lambda x: x.strip(), info['collections'])
    info['keywords'] = map(lambda k: k.strip(), strip_tags(ox.find_re(data, '<td>Keywords:</td>.*?<td>(.*?)\.\.\.<')).split(', '))
    info['keywords'] = ox.find_re(data, '<meta name="keywords" content="(.*?), stock image').split(', ')
    info['keywords'].sort()
    info['uploaded'] = ox.find_re(data, '<td>Uploaded on:</td>.*?<td>([\d\-]+)')
    info['downloads'] = ox.find_re(data, '<span class="fl">.*?(\d+) </span>')
    info['contributor'] = ox.find_re(data, '<td class="m">Contributor:</td>.*?<a href="user_view.php\?id=.*?">.*?alt="(.*?)"')
    info['description'] = strip_tags(ox.find_re(data, 'artistsDescriptionData = \["(.*?)<br'))
    info['description'] = info['description'].split('CLICK TO SEE')[0].strip()
    info['similar'] = re.compile('size=1\&id=(\d+)').findall(data)
    return info
def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
    """
    Return max_results tuples with title, url, description

    >>> find("The Matrix site:imdb.com", 1)[0][0]
    u'The Matrix (1999) - IMDb'
    >>> find("The Matrix site:imdb.com", 1)[0][1]
    u'http://www.imdb.com/title/tt0133093/'
    """
    results = []
    offset = 0
    while len(results) < max_results:
        url = 'http://google.com/search?q=%s' % quote_plus(query)
        if offset:
            url += '&start=%d' % offset
        data = read_url(url, timeout=timeout)
        data = re.sub('<span class="f">(.*?)</span>', '\\1', data)
        for a in re.compile('<a href="(htt\S+?)".*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>').findall(data):
            results.append((strip_tags(decode_html(a[1])), a[0], strip_tags(decode_html(a[2]))))
            if len(results) >= max_results:
                break
        offset += 10
    return results
def info(epub):
    data = {}
    try:
        z = zipfile.ZipFile(epub)
    except zipfile.BadZipFile:
        logger.debug('invalid epub file %s', epub)
        return data
    opf = [f.filename for f in z.filelist if f.filename.endswith('opf')]
    if opf:
        info = ET.fromstring(z.read(opf[0]))
        metadata = info.findall('{http://www.idpf.org/2007/opf}metadata')
        if metadata:
            metadata = metadata[0]
            for e in metadata.getchildren():
                if e.text and e.text.strip() and e.text not in ('unknown', 'none'):
                    key = e.tag.split('}')[-1]
                    key = {
                        'creator': 'author',
                    }.get(key, key)
                    value = e.text.strip()
                    if key == 'identifier':
                        value = normalize_isbn(value)
                        if stdnum.isbn.is_valid(value):
                            data['isbn'] = [value]
                    elif key == 'author':
                        data[key] = value.split(', ')
                    else:
                        data[key] = value
    if 'description' in data:
        data['description'] = strip_tags(decode_html(data['description']))
    text = extract_text(epub)
    data['textsize'] = len(text)
    if not 'isbn' in data:
        isbn = extract_isbn(text)
        if isbn:
            data['isbn'] = [isbn]
    if 'date' in data and 'T' in data['date']:
        data['date'] = data['date'].split('T')[0]
    if 'language' in data and isinstance(data['language'], str):
        data['language'] = get_language(data['language'])
    return data
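# A hypothetical call to the epub info() above; the path is invented, and
# info() returns an empty dict for files that are not valid zip archives:
#
#     data = info('/tmp/example.epub')
#     print(data.get('title'), data.get('isbn'), data.get('textsize'))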
def save(self, *args, **kwargs):
    set_public_id = not self.id or not self.public_id
    layer = self.get_layer()
    if self.value:
        self.value = utils.cleanup_value(self.value, layer['type'])
        self.findvalue = ox.decode_html(ox.strip_tags(re.sub('<br */?>\n?', ' ', self.value))).replace('\n', ' ')
        self.findvalue = unicodedata.normalize('NFKD', self.findvalue).lower()
        sortvalue = sort_string(self.findvalue)
        if sortvalue:
            self.sortvalue = sortvalue[:900]
        else:
            self.sortvalue = None
    else:
        self.findvalue = None
        self.sortvalue = None

    #no clip or update clip
    if self.layer in settings.CONFIG.get('clipLayers', []):
        if not self.clip or self.start != self.clip.start or self.end != self.clip.end:
            self.clip, created = Clip.get_or_create(self.item, self.start, self.end)
    elif self.clip:
        self.clip = None

    super(Annotation, self).save(*args, **kwargs)
    if set_public_id:
        self.set_public_id()

    if self.clip:
        Clip.objects.filter(**{
            'id': self.clip.id,
            self.layer: False
        }).update(**{self.layer: True})
        #update clip.findvalue
        self.clip.save()

    #editAnnotations needs to be in sync
    if layer.get('type') == 'place' or layer.get('hasPlaces'):
        update_matches(self.id, 'place')
    if layer.get('type') == 'event' or layer.get('hasEvents'):
        update_matches(self.id, 'event')
def get_data(id, language='en'):
    if language == 'de':
        url = 'http://films.arsenal-berlin.de/index.php/Detail/Object/Show/object_id/%d/lang/de_DE' % id
    else:
        url = 'http://films.arsenal-berlin.de/index.php/Detail/Object/Show/object_id/%d' % id
    html = read_url(url, unicode=True)
    if 'ID does not exist' in html:
        return None
    if 'Willkommen in der Datenbank des Arsenal' in html:
        return None
    data = {}
    data[u'id'] = id
    data[u'url'] = url
    m = re.compile('<h1>(.*?)</h1>').findall(html)
    if m:
        data[u'title'] = m[0]
    m = re.compile("<b>Director: </b><a href='.*?'>(.*?)</a>").findall(html)
    if m:
        data[u'director'] = m[0]
    m = re.compile("caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html)
    if m:
        data[u'image'] = m[0]

    units = re.compile("<div class='unit'>(.*?)</div>", re.DOTALL).findall(html)
    for x in map(re.compile('<b>(.*?)</b>: (.*)', re.DOTALL).findall, units):
        if x:
            #data[x[0][0].lower()] = strip_tags(x[0][1])
            key = x[0][0].lower()
            data[key] = x[0][1]
            if key == "forum catalogue pdf":
                data[key] = find_re(data[key], '"(http:.*?)"')
            else:
                data[key] = strip_tags(data[key])
    if "running time (minutes)" in data:
        data[u'runtime'] = float(data.pop("running time (minutes)").replace(',', '.')) * 60
    for key in ('year', 'length in metres', 'forum participation year', 'number of reels'):
        if key in data and data[key].isdigit():
            data[key] = int(data[key])
    return data
def get_episode_data(url):
    '''
    parses information on tv.com episode pages
    returns dict with title, show, description, score

    example:
      get_episode_data('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
    '''
    data = read_url(url, unicode=True)
    r = {}
    r['description'] = strip_tags(find_re(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
    r['show'] = find_re(data, '<h1>(.*?)</h1>')
    r['title'] = find_re(data, '<title>.*?: (.*?) - TV.com </title>')
    #episode score
    r['episode score'] = find_re(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')

    match = re.compile('Episode Number: (\d*?) Season Num: (\d*?) First Aired: (.*?)  ').findall(data)
    if match:
        r['season'] = int(match[0][1])
        r['episode'] = int(match[0][0])
        #'Wednesday September 29, 2004' -> 2004-09-29
        r['air date'] = time.strftime('%Y-%m-%d', time.strptime(match[0][2], '%A %B %d, %Y'))
    return r
def info(opf):
    data = {}
    try:
        with open(opf, 'rb') as fd:
            opf = ET.fromstring(fd.read().decode())
    except:
        logger.debug('failed to load opf %s', opf, exc_info=1)
        return data
    ns = '{http://www.idpf.org/2007/opf}'
    metadata = opf.findall(ns + 'metadata')[0]
    for e in metadata.getchildren():
        if e.text:
            key = e.tag.split('}')[-1]
            key = {
                'creator': 'author',
            }.get(key, key)
            value = e.text
            if key == 'identifier':
                isbn = normalize_isbn(value)
                if stdnum.isbn.is_valid(isbn):
                    if not 'isbn' in data:
                        data['isbn'] = [isbn]
                    else:
                        data['isbn'].append(isbn)
                if e.attrib.get(ns + 'scheme') == 'AMAZON':
                    if not 'asin' in data:
                        data['asin'] = [value]
                    else:
                        data['asin'].append(value)
            else:
                data[key] = strip_tags(e.text)
    #YYYY-MM-DD
    if 'date' in data and len(data['date']) > 10:
        data['date'] = data['date'][:10]
    if 'language' in data:
        data['language'] = get_language(data['language'])
    return data
def get_data(url):
    data = read_url(url, unicode=True)
    doc = document_fromstring(data)
    score = filter(lambda s: s.attrib.get('property') == 'v:average',
                   doc.xpath('//span[@class="score_value"]'))
    if score:
        score = int(score[0].text)
    else:
        score = -1
    authors = [a.text for a in doc.xpath('//div[@class="review_content"]//div[@class="author"]//a')]
    sources = [d.text for d in doc.xpath('//div[@class="review_content"]//div[@class="source"]/a')]
    reviews = [d.text for d in doc.xpath('//div[@class="review_content"]//div[@class="review_body"]')]
    scores = [int(d.text.strip()) for d in doc.xpath('//div[@class="review_content"]//div[contains(@class, "critscore")]')]
    urls = [a.attrib['href'] for a in doc.xpath('//div[@class="review_content"]//a[contains(@class, "external")]')]

    metacritics = []
    for i in range(len(authors)):
        metacritics.append({
            'critic': authors[i],
            'url': urls[i],
            'source': sources[i],
            'quote': strip_tags(reviews[i]).strip(),
            'score': scores[i],
        })
    return {
        'critics': metacritics,
        'id': get_id(url),
        'score': score,
        'url': url,
    }
def info(key, value):
    if key not in ('isbn',):
        raise IOError('unknown key %s' % key)
    if len(value) == 13:
        value = stdnum.isbn.to_isbn10(value)
    if len(value) != 10:
        raise IOError('invalid isbn %s' % value)

    url = 'http://www.amazon.com/dp/' + value
    data = read_url(url).decode()
    doc = lxml.html.document_fromstring(data)
    info = {}
    if '<title>404 - Document Not Found</title>' in data:
        return info
    if 'To discuss automated access to Amazon data please' in data:
        return info
    for l in doc.xpath('//link[@rel="canonical" and @href]'):
        info['asin'] = [l.get('href').rpartition('/')[-1]]
        break
    info['title'] = strip_tags(decode_html(doc.xpath('//span[@id="productTitle"]')[0].text))
    info['title'] = re.sub(' \([^\)]+? Classics\)', '', info['title'])
    info['title'] = re.sub(' \([^\)]+? Collection\)', '', info['title'])
    info['description'] = strip_tags(decode_html(unquote(re.compile('encodedDescription\' : "(.*?)",').findall(data)[0])))
    info['description'] = fix_bad_unicode(info['description'])
    content = doc.xpath('//div[@class="content"]')[0]
    content_info = {}
    for li in content.xpath('.//li'):
        v = li.text_content()
        if ': ' in v:
            k, v = li.text_content().split(': ', 1)
            content_info[k.strip()] = v.strip()
    if 'Language' in content_info:
        info['language'] = content_info['Language']
    if 'Publisher' in content_info:
        if ' (' in content_info['Publisher']:
            info['date'] = find_re(content_info['Publisher'].split(' (')[-1], '\d{4}')
        info['publisher'] = content_info['Publisher'].split(' (')[0]
        if '; ' in info['publisher']:
            info['publisher'], info['edition'] = info['publisher'].split('; ', 1)
    if 'ISBN-13' in content_info:
        if not 'isbn' in info:
            info['isbn'] = []
        info['isbn'].append(content_info['ISBN-13'].replace('-', ''))
    if 'ISBN-10' in content_info:
        if not 'isbn' in info:
            info['isbn'] = []
        info['isbn'].append(content_info['ISBN-10'])

    a = doc.xpath('//span[@class="a-size-medium"]')
    if a:
        for span in a:
            r = span.getchildren()[0].text.strip()
            role = get_role(r)
            if not role in info:
                info[role] = []
            info[role].append(span.text.strip())
    else:
        for span in doc.xpath('//span[@class="author notFaded"]'):
            author = [x.strip() for x in span.text_content().strip().split('\n') if x.strip()]
            role = get_role(author[-1])
            if not role in info:
                info[role] = []
            info[role].append(author[0])

    covers = re.compile('data-a-dynamic-image="({.+?})"').findall(data)[0]
    covers = json.loads(decode_html(covers))
    last = [0, 0]
    for url in covers:
        # keep the largest of the offered cover sizes
        if covers[url] > last:
            last = covers[url]
            info['cover'] = re.sub('(\._SX.+?_\.)', '.', url)
    return info
def item(request, id):
    id = id.split('/')[0]
    template = 'index.html'
    level = settings.CONFIG['capabilities']['canSeeItem']['guest']
    if not request.user.is_anonymous():
        level = request.user.get_profile().level
    qs = models.Item.objects.filter(itemId=id, level__lte=level)
    if qs.count() == 0:
        context = RequestContext(request, {
            'base_url': request.build_absolute_uri('/'),
            'settings': settings
        })
    else:
        item = qs[0]
        template = 'item.html'
        keys = [
            'year',
            'director',
            'topic',
            'summary'
        ]
        data = []
        for key in keys:
            value = item.get(key)
            if value:
                if isinstance(value, list):
                    value = u', '.join([unicode(v) for v in value])
                data.append({'key': key.capitalize(), 'value': value})
        clips = []
        clip = {'in': 0, 'annotations': []}
        #logged in users should have javascript. not adding annotations makes load faster
        if request.user.is_anonymous():
            for a in item.annotations.filter(
                layer__in=models.Annotation.public_layers()).order_by('start', 'end', 'sortvalue'):
                if clip['in'] < a.start:
                    if clip['annotations']:
                        clip['annotations'] = '<br />\n'.join(clip['annotations'])
                        clips.append(clip)
                    clip = {'in': a.start, 'annotations': []}
                clip['annotations'].append(a.value)
        ctx = {
            'current_url': request.build_absolute_uri(request.get_full_path()),
            'base_url': request.build_absolute_uri('/'),
            'url': request.build_absolute_uri('/%s' % id),
            'id': id,
            'settings': settings,
            'data': data,
            'clips': clips,
            'icon': settings.CONFIG['user']['ui']['icons'] == 'frames' and 'icon' or 'poster',
            'title': ox.decode_html(item.get('title', '')),
            'description': item.get_item_description()
        }
        if not settings.USE_IMDB:
            value = item.get('topic' in keys and 'topic' or 'keywords')
            if isinstance(value, list):
                value = ', '.join(value)
            if value:
                ctx['keywords'] = ox.strip_tags(value)
        context = RequestContext(request, ctx)
    return render_to_response(template, context)
def parse_list(html, title):
    html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
    r = map(strip_tags, re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
    if not r and html:
        r = [strip_tags(html)]
    return r
def parse_entry(html, title):
    html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
    return strip_tags(html).strip()
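# A quick sanity check for parse_entry() and parse_list() above, run against a
# hand-written snippet in the same <dt>/<dd> layout the scraped pages use
# (the snippet is invented for illustration):
#
#     snippet = '<dt>directed by</dt> <dd><li>Jean-Luc Godard</li></dd>'
#     parse_entry(snippet, 'directed by')  # -> 'Jean-Luc Godard'
#     parse_list(snippet, 'Directed By')   # -> ['Jean-Luc Godard']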
"nw": "Northern Mariana Islands", "wvu": "West Virginia", "-xxr": "Soviet Union", "-tar": "Tajik S.S.R.", "bcc": "British Columbia" } if __name__ == '__main__': import json import re import ox from ox.cache import read_url url = "http://www.loc.gov/marc/countries/countries_code.html" data = read_url(url).decode('utf-8') countries = dict([ [ox.strip_tags(c) for c in r] for r in re.compile('<tr>.*?class="code">(.*?)</td>.*?<td>(.*?)</td>', re.DOTALL).findall(data) ]) data = json.dumps(countries, indent=4, ensure_ascii=False).encode('utf-8') with open(__file__) as f: pydata = f.read() pydata = re.sub( re.compile('\nCOUNTRIES = {.*?}\n\n', re.DOTALL), '\nCOUNTRIES = %s\n\n' % data, pydata) with open(__file__, 'w') as f: f.write(pydata)
def parse_text(html, title):
    return strip_tags(find_re(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()