def get_data(piratebayId):
    _key_map = {
        'spoken language(s)': u'language',
        'texted language(s)': u'subtitle language',
        'by': u'uploader',
        'leechers': 'leecher',
        'seeders': 'seeder',
    }
    piratebayId = get_id(piratebayId)
    torrent = dict()
    torrent[u'id'] = piratebayId
    torrent[u'domain'] = 'thepiratebay.org'
    torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId

    data = read_url(torrent['comment_link'], unicode=True)
    torrent[u'title'] = find_re(data, '<title>(.*?) \(download torrent\) - TPB</title>')
    if not torrent[u'title']:
        return None
    torrent[u'title'] = decode_html(torrent[u'title']).strip()
    torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
    title = quote(torrent['title'].encode('utf-8'))
    torrent[u'torrent_link'] = "http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
    for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
        key = d[0].lower().strip()
        key = _key_map.get(key, key)
        value = decode_html(strip_tags(d[1].strip()))
        torrent[key] = value
    torrent[u'description'] = find_re(data, '<div class="nfo">(.*?)</div>')
    if torrent[u'description']:
        torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
    t = read_url(torrent[u'torrent_link'])
    torrent[u'torrent_info'] = get_torrent_info(t)
    return torrent
def get_data(url):
    data = read_url(url)
    r = {}
    r['title'] = find_re(data, '<h1 class="movie_title">(.*?)</h1>')
    if '(' in r['title']:
        r['year'] = find_re(r['title'], '\((\d*?)\)')
        r['title'] = strip_tags(re.sub('\((\d*?)\)', '', r['title'])).strip()
    r['summary'] = strip_tags(find_re(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
    # collapse tabs, newlines and runs of spaces into single spaces
    r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace('  ', ' ').replace('  ', ' ')
    if not r['summary']:
        r['summary'] = get_og(data, 'description')

    meter = re.compile('<span id="all-critics-meter" class="meter(.*?)">(.*?)</span>').findall(data)
    meter = filter(lambda m: m[1].isdigit(), meter)
    if meter:
        r['tomatometer'] = meter[0][1]
    r['rating'] = find_re(data, 'Average Rating: <span>([\d.]+)/10</span>')
    r['user_score'] = find_re(data, '<span class="meter popcorn numeric ">(\d+)</span>')
    r['user_rating'] = find_re(data, 'Average Rating: ([\d.]+)/5')
    poster = get_og(data, 'image')
    if poster and not 'poster_default.gif' in poster:
        r['posters'] = [poster]
    for key in r.keys():
        if not r[key]:
            del r[key]
    return r
def get_data(mininovaId):
    _key_map = {
        'by': u'uploader',
    }
    mininovaId = get_id(mininovaId)
    torrent = dict()
    torrent[u'id'] = mininovaId
    torrent[u'domain'] = 'mininova.org'
    torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
    torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
    torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId

    data = read_url(torrent['comment_link'], unicode=True) + read_url(torrent['details_link'], unicode=True)
    if '<h1>Torrent not found...</h1>' in data:
        return None

    for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
        key = d[0].lower().strip()
        key = _key_map.get(key, key)
        value = decode_html(strip_tags(d[1].strip()))
        torrent[key] = value

    torrent[u'title'] = find_re(data, '<title>(.*?):.*?</title>')
    torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
    torrent[u'description'] = find_re(data, '<div id="description">(.*?)</div>')
    if torrent['description']:
        torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
    t = read_url(torrent[u'torrent_link'])
    torrent[u'torrent_info'] = get_torrent_info(t)
    return torrent
def get_book(id):
    if isinstance(id, basestring) and id.startswith('http'):
        url = id
    else:
        url = get_url(id)
    html = ox.cache.read_url(url, unicode=True)
    data = {}
    data['url'] = url
    pages = []
    page = get_page(url)
    pages.append(page)
    data['base'], data['images'] = get_images(page, html, True)
    info = ox.find_re(html, '<table>.*?</table>')
    for i in re.compile('<tr.*?>(.*?)</tr>').findall(info):
        key, value = i.split('</td><td>')
        data[ox.strip_tags(key)] = ox.strip_tags(value)
    # follow the "next page" links until none are left; reassigning
    # `links` inside the loop is what advances the pagination
    links = re.compile('<a style="float: right;" href="(/buch/.*?)">').findall(html)
    while links:
        for l in links:
            l = 'http://gutenberg.spiegel.de' + l
            html = ox.cache.read_url(l)
            links = re.compile('<a style="float: right;" href="(/buch/.*?)">').findall(html)
            page = get_page(l)
            pages.append(page)
            data['images'] += get_images(page, html)
    data['pages'] = pages
    return data
def find(query=None, user=None, timeout=60):
    if user:
        url = "https://twitter.com/" + quote(user)
    else:
        url = "https://twitter.com/search/" + quote(query)
    data = ox.cache.read_url(url, timeout=timeout).decode("utf-8")
    doc = lxml.html.document_fromstring(data)
    tweets = []
    for e in doc.xpath("//div[contains(@class, 'original-tweet')]"):
        t = lxml.html.tostring(e)
        text = e.xpath(".//p[contains(@class, 'js-tweet-text')]")[0]
        html = lxml.html.tostring(text, encoding="unicode").strip()
        text = ox.decode_html(ox.strip_tags(html)).strip()
        user = re.compile('data-name="(.*?)"').findall(t)[0]
        user = ox.decode_html(ox.strip_tags(user)).strip()
        tweets.append({
            "id": re.compile('data-tweet-id="(\d+)"').findall(t)[0],
            "user-id": re.compile('data-user-id="(\d+)"').findall(t)[0],
            "name": re.compile('data-screen-name="(.*?)"').findall(t)[0],
            "time": datetime.fromtimestamp(int(re.compile('data-time="(\d+)"').findall(t)[0])),
            "user": user,
            "text": text,
            "html": html,
        })
    return tweets
def get_data(url):
    if not url.startswith('http:'):
        url = get_url(url)
    data = read_url(url, unicode=True)
    m = {
        'id': get_id(url),
        'url': url,
        'type': re.compile('ubu.com/(.*?)/').findall(url)[0]
    }
    for videourl, title in re.compile('<a href="(http://ubumexico.centro.org.mx/.*?)">(.*?)</a>').findall(data):
        if videourl.endswith('.srt'):
            m['srt'] = videourl
        elif not 'video' in m:
            m['video'] = videourl
            m['video'] = m['video'].replace('/video/ ', '/video/').replace(' ', '%20')
            if m['video'] == 'http://ubumexico.centro.org.mx/video/':
                del m['video']
            m['title'] = strip_tags(decode_html(title)).strip()
    if not 'url' in m:
        print url, 'missing'
    if 'title' in m:
        m['title'] = re.sub('(.*?) \(\d{4}\)$', '\\1', m['title'])

    match = re.compile("flashvars','file=(.*?.flv)'").findall(data)
    if match:
        m['flv'] = match[0]
        m['flv'] = m['flv'].replace('/video/ ', '/video/').replace(' ', '%20')

    y = re.compile('\((\d{4})\)').findall(data)
    if y:
        m['year'] = int(y[0])
    d = re.compile('Director: (.+)').findall(data)
    if d:
        m['director'] = strip_tags(decode_html(d[0])).strip()

    a = re.compile('<a href="(.*?)">Back to (.*?)</a>', re.DOTALL).findall(data)
    if a:
        m['artist'] = strip_tags(decode_html(a[0][1])).strip()
    else:
        a = re.compile('<a href="(.*?)">(.*?) in UbuWeb Film').findall(data)
        if a:
            m['artist'] = strip_tags(decode_html(a[0][1])).strip()
        else:
            a = re.compile('<b>(.*?)\(b\..*?\d{4}\)').findall(data)
            if a:
                m['artist'] = strip_tags(decode_html(a[0])).strip()
            elif m['id'] == 'film/lawder_color':
                m['artist'] = 'Standish Lawder'
    if 'artist' in m:
        m['artist'] = m['artist'].replace('in UbuWeb Film', '')
        m['artist'] = m['artist'].replace('on UbuWeb Film', '').strip()
    if m['id'] == 'film/coulibeuf':
        m['title'] = 'Balkan Baroque'
        m['year'] = 1999
    return m
def find(query, timeout=ox.cache.cache_timeout):
    if isinstance(query, unicode):
        query = query.encode('utf-8')
    params = urllib.urlencode({'q': query})
    url = 'http://duckduckgo.com/html/?' + params
    data = read_url(url, timeout=timeout).decode('utf-8')
    results = []
    regex = '<a .*?class="large" href="(.+?)">(.*?)</a>.*?<div class="snippet">(.*?)</div>'
    for r in re.compile(regex, re.DOTALL).findall(data):
        results.append((strip_tags(decode_html(r[1])), r[0], strip_tags(decode_html(r[2]))))
    return results
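# A minimal usage sketch for find() above. The query string is invented, and
# it assumes DuckDuckGo still serves the legacy /html/ layout the regex targets:
#
#     for title, url, snippet in find(u'ubuweb film')[:3]:
#         print title, url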
def get_data(id):
    url = "http://www.amazon.com/title/dp/%s/" % id
    data = read_url(url, unicode=True)

    def find_data(key):
        return find_re(data, '<li><b>%s:</b>(.*?)</li>' % key).strip()

    r = {}
    r['amazon'] = url
    r['title'] = find_re(data, '<span id="productTitle" class="a-size-large">(.*?)</span>')
    r['authors'] = []
    doc = lxml.html.document_fromstring(data)
    for e in doc.xpath("//span[contains(@class, 'author')]"):
        print e
        for secondary in e.xpath(".//span[contains(@class, 'a-color-secondary')]"):
            if 'Author' in secondary.text:
                author = e.xpath(".//span[contains(@class, 'a-size-medium')]")
                if author:
                    r['authors'].append(author[0].text.strip())
                else:
                    r['authors'].append(e.xpath('.//a')[0].text.strip())
                break
            elif 'Translator' in secondary.text:
                r['translator'] = [e.xpath('.//a')[0].text]
                break
    r['publisher'] = find_data('Publisher')
    r['language'] = find_data('Language')
    r['isbn-10'] = find_data('ISBN-10')
    r['isbn-13'] = find_data('ISBN-13').replace('-', '')
    r['dimensions'] = find_re(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')

    r['pages'] = find_data('Paperback')
    if not r['pages']:
        r['pages'] = find_data('Hardcover')

    r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()

    for e in doc.xpath('//noscript'):
        for c in e.getchildren():
            if c.tag == 'div':
                r['description'] = strip_tags(decode_html(lxml.html.tostring(c))).strip()
                break

    r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
    if r['cover']:
        r['cover'] = r['cover'][0].split('._BO2')[0]
        if not r['cover'].endswith('.jpg'):
            r['cover'] = r['cover'] + '.jpg'
        if 'no-image-avail-img' in r['cover']:
            del r['cover']
    else:
        del r['cover']
    return r
def get_reviews(url):
    data = read_url(url, unicode=True)
    doc = document_fromstring(data)
    score = doc.xpath('//span[@itemprop="ratingValue"]')
    if score:
        score = int(score[0].text)
    else:
        score = -1

    # NOTE: some reviews may not have authors
    # one solution is to track by source instead
    sources = [a.text for a in doc.xpath(
        '//div[contains(@class, "critic_reviews")]'
        '//div[@class="review_content"]'
        '//div[@class="source"]//a|//span[@class="no_link"]')]
    reviews = [d.text for d in doc.xpath(
        '//div[contains(@class, "critic_reviews")]'
        '//div[@class="review_content"]'
        '//div[@class="review_body"]')]
    scores = [score_to_int(d.text.strip()) for d in doc.xpath(
        '//div[contains(@class, "critic_reviews")]'
        '//div[@class="review_content"]'
        '//div[contains(@class, "metascore_w")]')]

    metacritics = []
    for i in range(len(reviews)):
        if scores[i] != -1:  # Don't include TBD scores
            metacritics.append({
                'source': sources[i],
                'quote': strip_tags(reviews[i]).strip(),
                'score': scores[i],
            })
    return {
        'critics': metacritics,
        'id': get_id(url),
        'score': score,
        'url': url,
    }
def get_show_data(url):
    data = read_url(url, unicode=True)
    r = {}
    r['title'] = strip_tags(find_re(data, '<h1>(.*?)</h1>'))
    r['imdb'] = find_re(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
    r['episodes'] = {}
    #1.  1- 1  1001  7 Aug 05  You Can't Miss the Bear
    for episode in re.compile('(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) <a target="_blank" href="(.*?)">(.*?)</a>').findall(data):
        air_date = episode[3].strip()
        #'22 Sep 04' -> 2004-09-22
        try:
            air_date = time.strftime('%Y-%m-%d', time.strptime(air_date, '%d %b %y'))
        except:
            pass
        s = episode[1].split('-')[0].strip()
        e = episode[1].split('-')[-1].strip()
        try:
            r['episodes']['S%02dE%02d' % (int(s), int(e))] = {
                'prod code': episode[2],
                'air date': air_date,
                'url': episode[4],
                'title': episode[5],
            }
        except:
            print "oxweb.epguides failed,", url
    return r
def get_data(isbn):
    r = {}
    url = '%s/Search/Book/%s/1' % (base, isbn)
    data = read_url(url).decode('utf-8')
    m = re.compile('href="(/Lookup/Book/[^"]+?)"').findall(data)
    if m:
        ids = m[0].split('/')
        r['isbn'] = ids[-2]
        r['asin'] = ids[-3]
        url = '%s%s' % (base, m[0])
        data = read_url(url).decode('utf-8')
        r["title"] = find_re(data, "<h2>(.*?)</h2>")
        keys = {
            'author': 'Author(s)',
            'publisher': 'Publisher',
            'date': 'Publication date',
            'edition': 'Edition',
            'binding': 'Binding',
            'volume': 'Volume(s)',
            'pages': 'Pages',
        }
        for key in keys:
            r[key] = find_re(data, '<span class="title">%s:</span>(.*?)</li>' % re.escape(keys[key]))
            if r[key] == '--':
                r[key] = ''
            if key == 'pages' and r[key]:
                r[key] = int(r[key])
        desc = find_re(data, '<h2>Description:<\/h2>(.*?)<div ')
        desc = desc.replace('<br /><br />', ' ').replace('<br /> ', ' ').replace('<br />', ' ')
        r['description'] = strip_tags(desc).strip()
        if r['description'] == u'Description of this item is not available at this time.':
            r['description'] = ''
        r['cover'] = find_re(data, '<img src="(.*?)" alt="Book cover').replace('._SL160_', '')
    return r
def parse_table(html):
    # split the results table into rows, then each row into cells,
    # stripping tags and non-breaking spaces from the cell text
    return [
        [
            strip_tags(r).strip().replace('&nbsp;', '')
            for r in x.split('<td width="305">-')
        ]
        for x in find_re(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
    ]
def lookup(id):
    logger.debug('lookup %s', id)
    r = {
        'asin': [id]
    }
    url = '%s/Lookup/Book/%s/%s/1' % (base, id, id)
    logger.debug('%s', url)
    data = read_url(url).decode('utf-8')
    r["title"] = find_re(data, "<h2>(.*?)</h2>")
    if r["title"] == 'Error!':
        return {}
    keys = {
        'author': 'Author(s)',
        'publisher': 'Publisher',
        'date': 'Publication date',
        'edition': 'Edition',
        'binding': 'Binding',
        'volume': 'Volume(s)',
        'pages': 'Pages',
    }
    for key in keys:
        r[key] = find_re(data, '<span class="title">%s:</span>(.*?)</li>' % re.escape(keys[key]))
        if r[key] == '--' or not r[key]:
            del r[key]
        if key == 'pages' and key in r:
            r[key] = int(r[key])
    desc = find_re(data, '<h2>Description:<\/h2>(.*?)<div ')
    desc = desc.replace('<br /><br />', ' ').replace('<br /> ', ' ').replace('<br />', ' ')
    r['description'] = decode_html(strip_tags(desc))
    r['cover'] = find_re(data, '<img src="(.*?)" alt="Book cover').replace('._SL160_', '')
    for key in r:
        if isinstance(r[key], str):
            r[key] = decode_html(strip_tags(r[key])).strip()
    if 'author' in r and isinstance(r['author'], str) and r['author']:
        r['author'] = [r['author']]
    else:
        r['author'] = []
    if not r['author'] or r['author'][0].isupper():
        del r['author']
    if r['description'].lower() == 'Description of this item is not available at this time.'.lower():
        r['description'] = ''
    return r
def download_subtitle(opensubtitle_id):
    srts = {}
    data = read_url('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
    reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
    for f in re.compile(reg_exp, re.DOTALL).findall(data):
        name = strip_tags(f[1]).split('\n')[0]
        # the download links are relative, so join them with the same
        # opensubtitles.org host the page itself was fetched from
        url = "http://www.opensubtitles.org%s" % f[0]
        srts[name] = read_url(url, unicode=True)
    return srts
def get_data(id):
    '''
    >>> get_data('129689')['cast'][1][1]
    u'Marianne'
    >>> get_data('129689')['credits'][0][0]
    u'Jean-Luc Godard'
    >>> get_data('129689')['posters'][0]
    u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg'
    >>> get_data('129689')['rating']
    u'4.5'
    '''
    if id.startswith('http'):
        id = get_id(id)
    data = {
        "url": get_url(id)
    }
    html = read_url(data["url"], unicode=True)
    data['aka'] = parse_list(html, 'AKA')
    data['category'] = find_re(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
    data['countries'] = parse_list(html, 'countries')
    data['director'] = parse_entry(html, 'directed by')
    data['genres'] = parse_list(html, 'genres')
    data['keywords'] = parse_list(html, 'keywords')
    data['posters'] = [find_re(html, '<img src="(http://cps-.*?)"')]
    data['produced'] = parse_list(html, 'produced by')
    data['rating'] = find_re(html, 'Stars" title="(.*?) Stars"')
    data['released'] = parse_entry(html, 'released by')
    data['releasedate'] = parse_list(html, 'release date')
    data['runtime'] = parse_entry(html, 'run time').replace('min.', '').strip()
    data['set'] = parse_entry(html, 'set in')
    data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
    data['themes'] = parse_list(html, 'themes')
    data['types'] = parse_list(html, 'types')
    data['year'] = find_re(html, '<span class="year">.*?(\d+)')
    #data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
    data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
    #html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
    #data['cast'] = parse_table(html)
    #html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
    #data['credits'] = parse_table(html)
    html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
    data['review'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
    return data
def get_data(id):
    base = 'http://www.istockphoto.com'
    url = base + '/stock-photo-%s.php' % id
    id = find_re(id, '\d+')
    data = ox.cache.read_url(url, timeout=-1)
    info = {}
    info['title'] = ox.find_re(data, '<title>(.*?) \|')
    info['thumbnail'] = base + ox.find_re(data, 'src="(/file_thumbview_approve/%s.*?)"' % id)
    info['views'] = ox.find_re(data, '<tr><td>Views:</td><td>(\d+)</td>')
    info['collections'] = strip_tags(ox.find_re(data, '<td>Collections:</td><td>(.*?)</td>')).split(', ')
    info['collections'] = filter(lambda x: x.strip(), info['collections'])
    info['keywords'] = map(lambda k: k.strip(), strip_tags(ox.find_re(data, '<td>Keywords:</td>.*?<td>(.*?)\.\.\.<')).split(', '))
    info['keywords'] = ox.find_re(data, '<meta name="keywords" content="(.*?), stock image').split(', ')
    info['keywords'].sort()
    info['uploaded'] = ox.find_re(data, '<td>Uploaded on:</td>.*?<td>([\d\-]+)')
    info['downloads'] = ox.find_re(data, '<span class="fl">.*?(\d+) </span>')
    info['contributor'] = ox.find_re(data, '<td class="m">Contributor:</td>.*?<a href="user_view.php\?id=.*?">.*?alt="(.*?)"')
    info['description'] = strip_tags(ox.find_re(data, 'artistsDescriptionData = \["(.*?)<br'))
    info['description'] = info['description'].split('CLICK TO SEE')[0].strip()
    info['similar'] = re.compile('size=1\&id=(\d+)').findall(data)
    return info
def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
    """
    Return max_results tuples with title, url, description

    >>> find("The Matrix site:imdb.com", 1)[0][0]
    u'The Matrix (1999) - IMDb'
    >>> find("The Matrix site:imdb.com", 1)[0][1]
    u'http://www.imdb.com/title/tt0133093/'
    """
    results = []
    offset = 0
    while len(results) < max_results:
        url = 'http://google.com/search?q=%s' % quote_plus(query)
        if offset:
            url += '&start=%d' % offset
        data = read_url(url, timeout=timeout)
        data = re.sub('<span class="f">(.*?)</span>', '\\1', data)
        for a in re.compile('<a href="(htt\S+?)".*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>').findall(data):
            results.append((strip_tags(decode_html(a[1])), a[0], strip_tags(decode_html(a[2]))))
            if len(results) >= max_results:
                break
        offset += 10
    return results
def info(epub):
    data = {}
    try:
        z = zipfile.ZipFile(epub)
    except zipfile.BadZipFile:
        logger.debug('invalid epub file %s', epub)
        return data
    opf = [f.filename for f in z.filelist if f.filename.endswith('opf')]
    if opf:
        info = ET.fromstring(z.read(opf[0]))
        metadata = info.findall('{http://www.idpf.org/2007/opf}metadata')
        if metadata:
            metadata = metadata[0]
            for e in metadata.getchildren():
                if e.text and e.text.strip() and e.text not in ('unknown', 'none'):
                    key = e.tag.split('}')[-1]
                    key = {
                        'creator': 'author',
                    }.get(key, key)
                    value = e.text.strip()
                    if key == 'identifier':
                        value = normalize_isbn(value)
                        if stdnum.isbn.is_valid(value):
                            data['isbn'] = [value]
                    elif key == 'author':
                        data[key] = value.split(', ')
                    else:
                        data[key] = value
    if 'description' in data:
        data['description'] = strip_tags(decode_html(data['description']))
    text = extract_text(epub)
    data['textsize'] = len(text)
    if not 'isbn' in data:
        isbn = extract_isbn(text)
        if isbn:
            data['isbn'] = [isbn]
    if 'date' in data and 'T' in data['date']:
        data['date'] = data['date'].split('T')[0]
    if 'language' in data and isinstance(data['language'], str):
        data['language'] = get_language(data['language'])
    return data
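# A hypothetical call to the epub info() above; the path is invented, and
# info() returns an empty dict for files that are not valid zip archives:
#
#     data = info('/tmp/example.epub')
#     print(data.get('title'), data.get('isbn'), data.get('textsize'))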
def save(self, *args, **kwargs):
    set_public_id = not self.id or not self.public_id
    layer = self.get_layer()
    if self.value:
        self.value = utils.cleanup_value(self.value, layer['type'])
        self.findvalue = ox.decode_html(ox.strip_tags(re.sub('<br */?>\n?', ' ', self.value))).replace('\n', ' ')
        self.findvalue = unicodedata.normalize('NFKD', self.findvalue).lower()
        sortvalue = sort_string(self.findvalue)
        if sortvalue:
            self.sortvalue = sortvalue[:900]
        else:
            self.sortvalue = None
    else:
        self.findvalue = None
        self.sortvalue = None

    #no clip or update clip
    if self.layer in settings.CONFIG.get('clipLayers', []):
        if not self.clip or self.start != self.clip.start or self.end != self.clip.end:
            self.clip, created = Clip.get_or_create(self.item, self.start, self.end)
    elif self.clip:
        self.clip = None

    super(Annotation, self).save(*args, **kwargs)
    if set_public_id:
        self.set_public_id()

    if self.clip:
        Clip.objects.filter(**{
            'id': self.clip.id,
            self.layer: False
        }).update(**{self.layer: True})
        #update clip.findvalue
        self.clip.save()

    #editAnnotations needs to be in sync
    if layer.get('type') == 'place' or layer.get('hasPlaces'):
        update_matches(self.id, 'place')
    if layer.get('type') == 'event' or layer.get('hasEvents'):
        update_matches(self.id, 'event')
def get_data(id, language='en'):
    if language == 'de':
        url = 'http://films.arsenal-berlin.de/index.php/Detail/Object/Show/object_id/%d/lang/de_DE' % id
    else:
        url = 'http://films.arsenal-berlin.de/index.php/Detail/Object/Show/object_id/%d' % id
    html = read_url(url, unicode=True)
    if 'ID does not exist' in html:
        return None
    if 'Willkommen in der Datenbank des Arsenal' in html:
        return None
    data = {}
    data[u'id'] = id
    data[u'url'] = url
    m = re.compile('<h1>(.*?)</h1>').findall(html)
    if m:
        data[u'title'] = m[0]
    m = re.compile("<b>Director: </b><a href='.*?'>(.*?)</a>").findall(html)
    if m:
        data[u'director'] = m[0]
    m = re.compile("caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html)
    if m:
        data[u'image'] = m[0]

    units = re.compile("<div class='unit'>(.*?)</div>", re.DOTALL).findall(html)
    for x in map(re.compile('<b>(.*?)</b>: (.*)', re.DOTALL).findall, units):
        if x:
            #data[x[0][0].lower()] = strip_tags(x[0][1])
            key = x[0][0].lower()
            data[key] = x[0][1]
            if key == "forum catalogue pdf":
                data[key] = find_re(data[key], '"(http:.*?)"')
            else:
                data[key] = strip_tags(data[key])
    if "running time (minutes)" in data:
        data[u'runtime'] = float(data.pop("running time (minutes)").replace(',', '.')) * 60
    for key in ('year', 'length in metres', 'forum participation year', 'number of reels'):
        if key in data and data[key].isdigit():
            data[key] = int(data[key])
    return data
def get_episode_data(url):
    '''
    parses information on tv.com episode pages
    returns dict with title, show, description, score

    example:
      get_episode_data('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
    '''
    data = read_url(url, unicode=True)
    r = {}
    r['description'] = strip_tags(find_re(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
    r['show'] = find_re(data, '<h1>(.*?)</h1>')
    r['title'] = find_re(data, '<title>.*?: (.*?) - TV.com </title>')
    #episode score
    r['episode score'] = find_re(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')

    match = re.compile('Episode Number: (\d*?) Season Num: (\d*?) First Aired: (.*?)  ').findall(data)
    if match:
        r['season'] = int(match[0][1])
        r['episode'] = int(match[0][0])
        #'Wednesday September 29, 2004' -> 2004-09-29
        r['air date'] = time.strftime('%Y-%m-%d', time.strptime(match[0][2], '%A %B %d, %Y'))
    return r
def info(opf):
    data = {}
    try:
        with open(opf, 'rb') as fd:
            opf = ET.fromstring(fd.read().decode())
    except:
        logger.debug('failed to load opf %s', opf, exc_info=1)
        return data
    ns = '{http://www.idpf.org/2007/opf}'
    metadata = opf.findall(ns + 'metadata')[0]
    for e in metadata.getchildren():
        if e.text:
            key = e.tag.split('}')[-1]
            key = {
                'creator': 'author',
            }.get(key, key)
            value = e.text
            if key == 'identifier':
                isbn = normalize_isbn(value)
                if stdnum.isbn.is_valid(isbn):
                    if not 'isbn' in data:
                        data['isbn'] = [isbn]
                    else:
                        data['isbn'].append(isbn)
                if e.attrib.get(ns + 'scheme') == 'AMAZON':
                    if not 'asin' in data:
                        data['asin'] = [value]
                    else:
                        data['asin'].append(value)
            else:
                data[key] = strip_tags(e.text)
    #YYYY-MM-DD
    if 'date' in data and len(data['date']) > 10:
        data['date'] = data['date'][:10]
    if 'language' in data:
        data['language'] = get_language(data['language'])
    return data
def get_data(url):
    data = read_url(url, unicode=True)
    doc = document_fromstring(data)
    score = filter(lambda s: s.attrib.get('property') == 'v:average',
                   doc.xpath('//span[@class="score_value"]'))
    if score:
        score = int(score[0].text)
    else:
        score = -1
    authors = [a.text for a in doc.xpath('//div[@class="review_content"]//div[@class="author"]//a')]
    sources = [d.text for d in doc.xpath('//div[@class="review_content"]//div[@class="source"]/a')]
    reviews = [d.text for d in doc.xpath('//div[@class="review_content"]//div[@class="review_body"]')]
    scores = [int(d.text.strip()) for d in doc.xpath('//div[@class="review_content"]//div[contains(@class, "critscore")]')]
    urls = [a.attrib['href'] for a in doc.xpath('//div[@class="review_content"]//a[contains(@class, "external")]')]

    metacritics = []
    for i in range(len(authors)):
        metacritics.append({
            'critic': authors[i],
            'url': urls[i],
            'source': sources[i],
            'quote': strip_tags(reviews[i]).strip(),
            'score': scores[i],
        })
    return {
        'critics': metacritics,
        'id': get_id(url),
        'score': score,
        'url': url,
    }
def info(key, value):
    if key not in ('isbn',):
        raise IOError('unknown key %s' % key)
    if len(value) == 13:
        value = stdnum.isbn.to_isbn10(value)
    if len(value) != 10:
        raise IOError('invalid isbn %s' % value)

    url = 'http://www.amazon.com/dp/' + value
    data = read_url(url).decode()
    doc = lxml.html.document_fromstring(data)
    info = {}
    if '<title>404 - Document Not Found</title>' in data:
        return info
    if 'To discuss automated access to Amazon data please' in data:
        return info
    for l in doc.xpath('//link[@rel="canonical" and @href]'):
        info['asin'] = [l.get('href').rpartition('/')[-1]]
        break
    info['title'] = strip_tags(decode_html(doc.xpath('//span[@id="productTitle"]')[0].text))
    info['title'] = re.sub(' \([^\)]+? Classics\)', '', info['title'])
    info['title'] = re.sub(' \([^\)]+? Collection\)', '', info['title'])
    info['description'] = strip_tags(decode_html(unquote(re.compile('encodedDescription\' : "(.*?)",').findall(data)[0])))
    info['description'] = fix_bad_unicode(info['description'])
    content = doc.xpath('//div[@class="content"]')[0]
    content_info = {}
    for li in content.xpath('.//li'):
        v = li.text_content()
        if ': ' in v:
            k, v = li.text_content().split(': ', 1)
            content_info[k.strip()] = v.strip()
    if 'Language' in content_info:
        info['language'] = content_info['Language']
    if 'Publisher' in content_info:
        if ' (' in content_info['Publisher']:
            info['date'] = find_re(content_info['Publisher'].split(' (')[-1], '\d{4}')
        info['publisher'] = content_info['Publisher'].split(' (')[0]
        if '; ' in info['publisher']:
            info['publisher'], info['edition'] = info['publisher'].split('; ', 1)
    if 'ISBN-13' in content_info:
        if not 'isbn' in info:
            info['isbn'] = []
        info['isbn'].append(content_info['ISBN-13'].replace('-', ''))
    if 'ISBN-10' in content_info:
        if not 'isbn' in info:
            info['isbn'] = []
        info['isbn'].append(content_info['ISBN-10'])

    a = doc.xpath('//span[@class="a-size-medium"]')
    if a:
        for span in a:
            r = span.getchildren()[0].text.strip()
            role = get_role(r)
            if not role in info:
                info[role] = []
            info[role].append(span.text.strip())
    else:
        for span in doc.xpath('//span[@class="author notFaded"]'):
            author = [x.strip() for x in span.text_content().strip().split('\n') if x.strip()]
            role = get_role(author[-1])
            if not role in info:
                info[role] = []
            info[role].append(author[0])

    covers = re.compile('data-a-dynamic-image="({.+?})"').findall(data)[0]
    covers = json.loads(decode_html(covers))
    last = [0, 0]
    for url in covers:
        # keep the largest of the offered cover sizes
        if covers[url] > last:
            last = covers[url]
            info['cover'] = re.sub('(\._SX.+?_\.)', '.', url)
    return info
def item(request, id):
    id = id.split('/')[0]
    template = 'index.html'
    level = settings.CONFIG['capabilities']['canSeeItem']['guest']
    if not request.user.is_anonymous():
        level = request.user.get_profile().level
    qs = models.Item.objects.filter(itemId=id, level__lte=level)
    if qs.count() == 0:
        context = RequestContext(request, {
            'base_url': request.build_absolute_uri('/'),
            'settings': settings
        })
    else:
        item = qs[0]
        template = 'item.html'
        keys = [
            'year',
            'director',
            'topic',
            'summary'
        ]
        data = []
        for key in keys:
            value = item.get(key)
            if value:
                if isinstance(value, list):
                    value = u', '.join([unicode(v) for v in value])
                data.append({'key': key.capitalize(), 'value': value})
        clips = []
        clip = {'in': 0, 'annotations': []}
        #logged in users should have javascript. not adding annotations makes load faster
        if request.user.is_anonymous():
            for a in item.annotations.filter(
                layer__in=models.Annotation.public_layers()).order_by('start', 'end', 'sortvalue'):
                if clip['in'] < a.start:
                    if clip['annotations']:
                        clip['annotations'] = '<br />\n'.join(clip['annotations'])
                        clips.append(clip)
                    clip = {'in': a.start, 'annotations': []}
                clip['annotations'].append(a.value)
        ctx = {
            'current_url': request.build_absolute_uri(request.get_full_path()),
            'base_url': request.build_absolute_uri('/'),
            'url': request.build_absolute_uri('/%s' % id),
            'id': id,
            'settings': settings,
            'data': data,
            'clips': clips,
            'icon': settings.CONFIG['user']['ui']['icons'] == 'frames' and 'icon' or 'poster',
            'title': ox.decode_html(item.get('title', '')),
            'description': item.get_item_description()
        }
        if not settings.USE_IMDB:
            value = item.get('topic' in keys and 'topic' or 'keywords')
            if isinstance(value, list):
                value = ', '.join(value)
            if value:
                ctx['keywords'] = ox.strip_tags(value)
        context = RequestContext(request, ctx)
    return render_to_response(template, context)
def parse_list(html, title):
    html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
    r = map(strip_tags, re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
    if not r and html:
        r = [strip_tags(html)]
    return r
def parse_entry(html, title):
    html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
    return strip_tags(html).strip()
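# A quick sanity check for parse_entry() and parse_list() above, run against a
# hand-written snippet in the same <dt>/<dd> layout the scraped pages use
# (the snippet is invented for illustration):
#
#     snippet = '<dt>directed by</dt> <dd><li>Jean-Luc Godard</li></dd>'
#     parse_entry(snippet, 'directed by')  # -> 'Jean-Luc Godard'
#     parse_list(snippet, 'Directed By')   # -> ['Jean-Luc Godard']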
"nw": "Northern Mariana Islands", "wvu": "West Virginia", "-xxr": "Soviet Union", "-tar": "Tajik S.S.R.", "bcc": "British Columbia" } if __name__ == '__main__': import json import re import ox from ox.cache import read_url url = "http://www.loc.gov/marc/countries/countries_code.html" data = read_url(url).decode('utf-8') countries = dict([ [ox.strip_tags(c) for c in r] for r in re.compile('<tr>.*?class="code">(.*?)</td>.*?<td>(.*?)</td>', re.DOTALL).findall(data) ]) data = json.dumps(countries, indent=4, ensure_ascii=False).encode('utf-8') with open(__file__) as f: pydata = f.read() pydata = re.sub( re.compile('\nCOUNTRIES = {.*?}\n\n', re.DOTALL), '\nCOUNTRIES = %s\n\n' % data, pydata) with open(__file__, 'w') as f: f.write(pydata)
def parse_text(html, title):
    return strip_tags(find_re(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()