def get_data(piratebayId):
    _key_map = {
        'spoken language(s)': u'language',
        'texted language(s)': u'subtitle language',
        'by': u'uploader',
        'leechers': 'leecher',
        'seeders': 'seeder',
    }
    piratebayId = get_id(piratebayId)
    torrent = dict()
    torrent[u'id'] = piratebayId
    torrent[u'domain'] = 'thepiratebay.org'
    torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId

    data = read_url(torrent['comment_link'], unicode=True)
    torrent[u'title'] = find_re(data, '<title>(.*?) \(download torrent\) - TPB</title>')
    if not torrent[u'title']:
        return None
    torrent[u'title'] = decode_html(torrent[u'title']).strip()
    torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
    title = quote(torrent['title'].encode('utf-8'))
    torrent[u'torrent_link'] = "http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
    for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
        key = d[0].lower().strip()
        key = _key_map.get(key, key)
        value = decode_html(strip_tags(d[1].strip()))
        torrent[key] = value
    torrent[u'description'] = find_re(data, '<div class="nfo">(.*?)</div>')
    if torrent[u'description']:
        torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
    t = read_url(torrent[u'torrent_link'])
    torrent[u'torrent_info'] = get_torrent_info(t)
    return torrent
def get_path_info(self):
    data = {}
    for key in self.PATH_INFO:
        data[key] = self.info.get(key, None)
    if self.item:
        for key in self.ITEM_INFO:
            data[key] = self.item.get(key)
            if isinstance(data[key], basestring):
                data[key] = ox.decode_html(data[key])
            elif isinstance(data[key], list):
                data[key] = [ox.decode_html(e) for e in data[key]]
        if self.item.get('series'):
            data['isEpisode'] = True
        data['directorSort'] = [get_name_sort(n) for n in self.item.get('director', [])]
    data['isEpisode'] = 'isEpisode' in data \
        or data.get('season') != None \
        or data.get('episode') != None \
        or data.get('episodes') not in ([], None) \
        or (data.get('seriesTitle') != None and data.get('episodeTitle') != None)
    if data['isEpisode'] and data['seriesYear'] == None:
        data['seriesYear'] = data['year']
    data['type'] = 'unknown'
    if 'extension' in data and data['extension']:
        data['extension'] = data['extension'].lower()
        for type in ox.movie.EXTENSIONS:
            if data['extension'] in ox.movie.EXTENSIONS[type]:
                data['type'] = type
    if 'part' in data and isinstance(data['part'], int):
        data['part'] = str(data['part'])
    return data
def get_data(mininovaId):
    _key_map = {
        'by': u'uploader',
    }
    mininovaId = get_id(mininovaId)
    torrent = dict()
    torrent[u'id'] = mininovaId
    torrent[u'domain'] = 'mininova.org'
    torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
    torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
    torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId

    data = read_url(torrent['comment_link'], unicode=True) + read_url(torrent['details_link'], unicode=True)
    if '<h1>Torrent not found...</h1>' in data:
        return None

    for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
        key = d[0].lower().strip()
        key = _key_map.get(key, key)
        value = decode_html(strip_tags(d[1].strip()))
        torrent[key] = value

    torrent[u'title'] = find_re(data, '<title>(.*?):.*?</title>')
    torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
    torrent[u'description'] = find_re(data, '<div id="description">(.*?)</div>')
    if torrent['description']:
        torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
    t = read_url(torrent[u'torrent_link'])
    torrent[u'torrent_info'] = get_torrent_info(t)
    return torrent
def find(query=None, user=None, timeout=60):
    if user:
        url = "https://twitter.com/" + quote(user)
    else:
        url = "https://twitter.com/search/" + quote(query)
    data = ox.cache.read_url(url, timeout=timeout).decode("utf-8")
    doc = lxml.html.document_fromstring(data)
    tweets = []
    for e in doc.xpath("//div[contains(@class, 'original-tweet')]"):
        t = lxml.html.tostring(e)
        text = e.xpath(".//p[contains(@class, 'js-tweet-text')]")[0]
        html = lxml.html.tostring(text, encoding="unicode").strip()
        text = ox.decode_html(ox.strip_tags(html)).strip()
        user = re.compile('data-name="(.*?)"').findall(t)[0]
        user = ox.decode_html(ox.strip_tags(user)).strip()
        tweets.append({
            "id": re.compile('data-tweet-id="(\d+)"').findall(t)[0],
            "user-id": re.compile('data-user-id="(\d+)"').findall(t)[0],
            "name": re.compile('data-screen-name="(.*?)"').findall(t)[0],
            "time": datetime.fromtimestamp(int(re.compile('data-time="(\d+)"').findall(t)[0])),
            "user": user,
            "text": text,
            "html": html,
        })
    return tweets
def get_data(url):
    if not url.startswith('http:'):
        url = get_url(url)
    data = read_url(url, unicode=True)
    m = {
        'id': get_id(url),
        'url': url,
        'type': re.compile('ubu.com/(.*?)/').findall(url)[0]
    }
    for videourl, title in re.compile('<a href="(http://ubumexico.centro.org.mx/.*?)">(.*?)</a>').findall(data):
        if videourl.endswith('.srt'):
            m['srt'] = videourl
        elif not 'video' in m:
            m['video'] = videourl
            m['video'] = m['video'].replace('/video/ ', '/video/').replace(' ', '%20')
            if m['video'] == 'http://ubumexico.centro.org.mx/video/':
                del m['video']
            m['title'] = strip_tags(decode_html(title)).strip()
    if not 'url' in m:
        print url, 'missing'
    if 'title' in m:
        m['title'] = re.sub('(.*?) \(\d{4}\)$', '\\1', m['title'])

    match = re.compile("flashvars','file=(.*?.flv)'").findall(data)
    if match:
        m['flv'] = match[0]
        m['flv'] = m['flv'].replace('/video/ ', '/video/').replace(' ', '%20')

    y = re.compile('\((\d{4})\)').findall(data)
    if y:
        m['year'] = int(y[0])
    d = re.compile('Director: (.+)').findall(data)
    if d:
        m['director'] = strip_tags(decode_html(d[0])).strip()

    a = re.compile('<a href="(.*?)">Back to (.*?)</a>', re.DOTALL).findall(data)
    if a:
        m['artist'] = strip_tags(decode_html(a[0][1])).strip()
    else:
        a = re.compile('<a href="(.*?)">(.*?) in UbuWeb Film').findall(data)
        if a:
            m['artist'] = strip_tags(decode_html(a[0][1])).strip()
        else:
            a = re.compile('<b>(.*?)\(b\..*?\d{4}\)').findall(data)
            if a:
                m['artist'] = strip_tags(decode_html(a[0])).strip()
            elif m['id'] == 'film/lawder_color':
                m['artist'] = 'Standish Lawder'

    if 'artist' in m:
        m['artist'] = m['artist'].replace('in UbuWeb Film', '')
        m['artist'] = m['artist'].replace('on UbuWeb Film', '').strip()
    if m['id'] == 'film/coulibeuf':
        m['title'] = 'Balkan Baroque'
        m['year'] = 1999
    return m
def find(query, timeout=ox.cache.cache_timeout):
    if isinstance(query, unicode):
        query = query.encode('utf-8')
    params = urllib.urlencode({'q': query})
    url = 'http://duckduckgo.com/html/?' + params
    data = read_url(url, timeout=timeout).decode('utf-8')
    results = []
    regex = '<a .*?class="large" href="(.+?)">(.*?)</a>.*?<div class="snippet">(.*?)</div>'
    for r in re.compile(regex, re.DOTALL).findall(data):
        results.append((strip_tags(decode_html(r[1])), r[0], strip_tags(decode_html(r[2]))))
    return results
def find_movies(query=None, imdb=None, max_results=10):
    if imdb:
        query = "tt" + normalize_imdbid(imdb)
    results = []
    next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
    page_count = 1
    while next and page_count < 4:
        page_count += 1
        url = next[0]
        if not url.startswith('http'):
            if not url.startswith('/'):
                url = "/" + url
            url = "http://thepiratebay.org" + url
        data = read_url(url, timeout=cache_timeout, unicode=True)
        regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/torrent/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
        for row in re.compile(regexp, re.DOTALL).findall(data):
            torrentType = row[0]
            torrentLink = "http://thepiratebay.org" + row[1]
            torrentTitle = decode_html(row[2])
            # 201 = Movies, 202 = Movie DVDR, 205 = TV Shows
            if torrentType in ['201']:
                results.append((torrentTitle, torrentLink, ''))
                if len(results) >= max_results:
                    return results
        next = re.compile('<a.*?href="(.*?)".*?>.*?next.gif.*?</a>').findall(data)
    return results
def lookup(id):
    logger.debug('lookup %s', id)
    r = {
        'asin': [id]
    }
    url = '%s/Lookup/Book/%s/%s/1' % (base, id, id)
    logger.debug('%s', url)
    data = read_url(url).decode('utf-8')
    r["title"] = find_re(data, "<h2>(.*?)</h2>")
    if r["title"] == 'Error!':
        return {}
    keys = {
        'author': 'Author(s)',
        'publisher': 'Publisher',
        'date': 'Publication date',
        'edition': 'Edition',
        'binding': 'Binding',
        'volume': 'Volume(s)',
        'pages': 'Pages',
    }
    for key in keys:
        r[key] = find_re(data, '<span class="title">%s:</span>(.*?)</li>' % re.escape(keys[key]))
        if r[key] == '--' or not r[key]:
            del r[key]
        if key == 'pages' and key in r:
            r[key] = int(r[key])
    desc = find_re(data, '<h2>Description:<\/h2>(.*?)<div ')
    desc = desc.replace('<br /><br />', ' ').replace('<br /> ', ' ').replace('<br />', ' ')
    r['description'] = decode_html(strip_tags(desc))
    r['cover'] = find_re(data, '<img src="(.*?)" alt="Book cover').replace('._SL160_', '')
    for key in r:
        if isinstance(r[key], str):
            r[key] = decode_html(strip_tags(r[key])).strip()
    if 'author' in r and isinstance(r['author'], str) and r['author']:
        r['author'] = [r['author']]
    else:
        r['author'] = []
    if not r['author'] or r['author'][0].isupper():
        del r['author']
    if r['description'].lower() == 'Description of this item is not available at this time.'.lower():
        r['description'] = ''
    return r
def get_matches(obj, model, layer_type, qs=None):
    super_matches = obj.get_super_matches()
    exact = [l['id'] for l in filter(lambda l: l['type'] == layer_type, settings.CONFIG['layers'])]
    if exact:
        q = Q(value__iexact=obj.name)
        for name in obj.alternativeNames:
            q = q | Q(value__iexact=name)
        f = q & Q(layer__in=exact)
    else:
        f = None
    has_type = 'has%ss' % layer_type.capitalize()
    contains = [l['id'] for l in filter(lambda l: l.get(has_type), settings.CONFIG['layers'])]
    if contains:
        name = ox.decode_html(obj.name)
        q = Q(findvalue__icontains=" " + name) | Q(findvalue__istartswith=name)
        for name in obj.alternativeNames:
            name = ox.decode_html(name)
            q = q | Q(findvalue__icontains=" " + name) | Q(findvalue__istartswith=name)
        contains_matches = q & Q(layer__in=contains)
        if f:
            f = contains_matches | f
        else:
            f = contains_matches
    matches = []
    if not qs:
        qs = Annotation.objects.all()
    for a in qs.filter(f):
        if a.findvalue:
            value = a.findvalue.lower()
            for name in super_matches:
                name = ox.decode_html(name)
                value = value.replace(name.lower(), '')
            for name in [obj.name] + list(obj.alternativeNames):
                name = name.lower()
                name = ox.decode_html(name)
                if name in value and (exact or re.compile('((^|\s)%s([\.,;:!?\-\/\s]|$))' % re.escape(name)).findall(value)):
                    matches.append(a.id)
                    break
    if not matches:
        matches = [-1]
    return Annotation.objects.filter(id__in=matches)
def update_matches(id, type):
    if type == 'place':
        from place.models import Place as Model
    elif type == 'event':
        from event.models import Event as Model
    a = models.Annotation.objects.get(pk=id)
    a_matches = getattr(a, type == 'place' and 'places' or 'events')
    #remove undefined matches that only have this annotation
    for p in a_matches.filter(defined=False).exclude(name=a.value):
        if p.annotations.exclude(id=id).count() == 0:
            p.delete()
    if a.get_layer().get('type') == type and a_matches.count() == 0:
        a_matches.add(Model.get_or_create(a.value))
        for p in a_matches.all():
            p.update_matches()
    if a.findvalue:
        names = {}
        for n in Model.objects.all().values('id', 'name', 'alternativeNames'):
            names[n['id']] = [ox.decode_html(x) for x in [n['name']] + json.loads(n['alternativeNames'])]
        value = a.findvalue.lower()
        current = [p.id for p in a_matches.all()]
        matches = []
        name_matches = []
        for i in names:
            for name in names[i]:
                if name.lower() in value:
                    matches.append(i)
                    name_matches.append(name.lower())
                    break
        new = []
        for i in matches:
            p = Model.objects.get(pk=i)
            #only add places/events that did not get added as a super match
            #i.e. only add The Paris Region and not Paris
            if not filter(lambda n: n in name_matches, [n.lower() for n in p.get_super_matches()]):
                new.append(i)
        removed = filter(lambda p: p not in new, current)
        added = filter(lambda p: p not in current, new)
        update = removed + added
        if update:
            for e in Model.objects.filter(id__in=update):
                e.update_matches(models.Annotation.objects.filter(pk=a.id))
    else:
        #annotation has no value, remove all existing matches
        for e in a_matches.all():
            e.update_matches(models.Annotation.objects.filter(pk=a.id))
def info(id, timeout=cache_timeout):
    info = {}
    if id.startswith('http'):
        id = get_id(id)
    if not id:
        return info
    url = "http://gdata.youtube.com/feeds/api/videos/%s?v=2" % id
    data = read_url(url, timeout=timeout)
    xml = parseString(data)
    info['id'] = id
    info['url'] = get_url(id)
    info['title'] = xml.getElementsByTagName('title')[0].firstChild.data
    info['description'] = xml.getElementsByTagName('media:description')[0].firstChild.data
    info['date'] = xml.getElementsByTagName('published')[0].firstChild.data.split('T')[0]
    info['author'] = "http://www.youtube.com/user/%s" % xml.getElementsByTagName('name')[0].firstChild.data

    info['categories'] = []
    for cat in xml.getElementsByTagName('media:category'):
        info['categories'].append(cat.firstChild.data)

    k = xml.getElementsByTagName('media:keywords')[0].firstChild
    if k:
        info['keywords'] = k.data.split(', ')
    data = read_url(info['url'], timeout=timeout)
    match = re.compile('<h4>License:</h4>(.*?)</p>', re.DOTALL).findall(data)
    if match:
        info['license'] = match[0].strip()
        info['license'] = re.sub('<.+?>', '', info['license']).strip()

    url = "http://www.youtube.com/api/timedtext?hl=en&type=list&tlangs=1&v=%s&asrs=1" % id
    data = read_url(url, timeout=timeout)
    xml = parseString(data)
    languages = [t.getAttribute('lang_code') for t in xml.getElementsByTagName('track')]
    if languages:
        info['subtitles'] = {}
        for language in languages:
            url = "http://www.youtube.com/api/timedtext?hl=en&v=%s&type=track&lang=%s&name&kind" % (id, language)
            data = read_url(url, timeout=timeout)
            xml = parseString(data)
            subs = []
            for t in xml.getElementsByTagName('text'):
                start = float(t.getAttribute('start'))
                duration = t.getAttribute('dur')
                if not duration:
                    duration = '2'
                end = start + float(duration)
                text = t.firstChild.data
                subs.append({
                    'in': start,
                    'out': end,
                    'value': ox.decode_html(text),
                })
            info['subtitles'][language] = subs
    return info
def get_data(id):
    url = "http://www.amazon.com/title/dp/%s/" % id
    data = read_url(url, unicode=True)

    def find_data(key):
        return find_re(data, '<li><b>%s:</b>(.*?)</li>' % key).strip()

    r = {}
    r['amazon'] = url
    r['title'] = find_re(data, '<span id="productTitle" class="a-size-large">(.*?)</span>')
    r['authors'] = []
    doc = lxml.html.document_fromstring(data)
    for e in doc.xpath("//span[contains(@class, 'author')]"):
        print e
        for secondary in e.xpath(".//span[contains(@class, 'a-color-secondary')]"):
            if 'Author' in secondary.text:
                author = e.xpath(".//span[contains(@class, 'a-size-medium')]")
                if author:
                    r['authors'].append(author[0].text.strip())
                else:
                    r['authors'].append(e.xpath('.//a')[0].text.strip())
                break
            elif 'Translator' in secondary.text:
                r['translator'] = [e.xpath('.//a')[0].text]
                break
    r['publisher'] = find_data('Publisher')
    r['language'] = find_data('Language')
    r['isbn-10'] = find_data('ISBN-10')
    r['isbn-13'] = find_data('ISBN-13').replace('-', '')
    r['dimensions'] = find_re(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')

    r['pages'] = find_data('Paperback')
    if not r['pages']:
        r['pages'] = find_data('Hardcover')

    r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()

    for e in doc.xpath('//noscript'):
        for c in e.getchildren():
            if c.tag == 'div':
                r['description'] = strip_tags(decode_html(lxml.html.tostring(c))).strip()
                break

    r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
    if r['cover']:
        r['cover'] = r['cover'][0].split('._BO2')[0]
        if not r['cover'].endswith('.jpg'):
            r['cover'] = r['cover'] + '.jpg'
        if 'no-image-avail-img' in r['cover']:
            del r['cover']
    else:
        del r['cover']
    return r
def move(self):
    def format_underscores(string):
        return re.sub('^\.|\.$|:|/|\?|<|>', '_', string)
    prefs = settings.preferences
    prefix = os.path.join(os.path.expanduser(prefs['libraryPath']), 'Books/')
    j = self.item.json()
    current_path = self.fullpath()
    if not os.path.exists(current_path):
        logger.debug('file is missing. %s', current_path)
        return
    author = '; '.join([get_sort_name(a) for a in j.get('author', [])])
    if not author:
        author = 'Unknown Author'
    title = j.get('title', 'Untitled')
    extension = j['extension']
    if len(title) > 100:
        title = title[:100]
    title = format_underscores(title)
    author = format_underscores(author)
    publisher = j.get('publisher')
    if publisher:
        extra = ', '.join(publisher)
    else:
        extra = ''
    date = j.get('date')
    if date and len(date) >= 4:
        extra += ' ' + date[:4]
    if extra:
        title = '%s (%s)' % (title, extra.strip())
    filename = '%s.%s' % (title, extension)
    first = unicodedata.normalize('NFD', author[0].upper())[0].upper()
    new_path = os.path.join(first, author, filename)
    new_path = new_path.replace('\x00', '')
    new_path = ox.decode_html(new_path)
    if self.path == new_path:
        return
    h = ''
    while os.path.exists(os.path.join(prefix, new_path)):
        h = self.sha1[:len(h)+1]
        filename = '%s.%s.%s' % (title, h, extension)
        first = unicodedata.normalize('NFD', author[0].upper())[0].upper()
        new_path = os.path.join(first, author, filename)
        if current_path == os.path.join(prefix, new_path):
            break
    if self.path != new_path:
        path = os.path.join(prefix, new_path)
        ox.makedirs(os.path.dirname(path))
        shutil.move(current_path, path)
        self.path = new_path
        self.save()
def subreddit(name, offset=0, n=0, timeout=cache_timeout):
    url = 'http://www.reddit.com/r/%s/' % name
    if offset:
        url += '?count=%d' % offset
    data = read_url(url, unicode=True, timeout=timeout)
    more = True
    links = []
    while more:
        l = re.compile('<a class="title " href="(.*?)".*?>(.*?)<\/a>').findall(data)
        if l:
            links += [{
                'url': ox.decode_html(a[0]),
                'title': ox.decode_html(a[1])
            } for a in l]
        more = re.compile('<a href="(.*?)" rel="nofollow next" >next ›<\/a>').findall(data)
        if more and (n == 0 or len(links) < n):
            url = ox.decode_html(more[0].split('"')[-1])
            data = read_url(url, unicode=True)
        else:
            more = False
    return links
def _parse_results_page(data, max_results=10):
    results = []
    regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
    for row in re.compile(regexp, re.DOTALL).findall(data):
        torrentDate = row[0]
        torrentExtra = row[1]
        torrentId = row[2]
        torrentTitle = decode_html(row[3]).strip()
        torrentLink = "http://www.mininova.org/tor/" + torrentId
        privateTracker = 'priv.gif' in torrentExtra
        if not privateTracker:
            results.append((torrentTitle, torrentLink, ''))
    return results
def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
    """
    Return max_results tuples with title, url, description

    >>> find("The Matrix site:imdb.com", 1)[0][0]
    u'The Matrix (1999) - IMDb'

    >>> find("The Matrix site:imdb.com", 1)[0][1]
    u'http://www.imdb.com/title/tt0133093/'
    """
    results = []
    offset = 0
    while len(results) < max_results:
        url = 'http://google.com/search?q=%s' % quote_plus(query)
        if offset:
            url += '&start=%d' % offset
        data = read_url(url, timeout=timeout)
        data = re.sub('<span class="f">(.*?)</span>', '\\1', data)
        for a in re.compile('<a href="(htt\S+?)".*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>').findall(data):
            results.append((strip_tags(decode_html(a[1])), a[0], strip_tags(decode_html(a[2]))))
            if len(results) >= max_results:
                break
        offset += 10
    return results
def addEvent(request):
    '''
    takes {
        name: string,
        start: string,
        end: string
    }
    returns {
        id: string
    }
    '''
    data = json.loads(request.POST['data'])
    existing_names = []
    exists = False
    names = [data['name']] + data.get('alternativeNames', [])
    for name in names:
        name = ox.decode_html(name)
        if models.Event.objects.filter(defined=True, name_find__icontains=u'|%s|' % name).count() != 0:
            exists = True
            existing_names.append(name)
    if not exists:
        models.Event.objects.filter(defined=False, name__in=names).delete()
        data['name'] = ox.escape_html(data['name'])
        event = models.Event(name=data['name'])
        for key in ('start', 'startTime', 'end', 'endTime', 'duration', 'durationTime',
                    'type', 'alternativeNames'):
            if key in data and data[key]:
                value = data[key]
                if isinstance(value, basestring):
                    value = ox.escape_html(value)
                if key == 'alternativeNames':
                    value = tuple([ox.escape_html(v) for v in value])
                setattr(event, key, value)
        if 'nameSort' in data:
            value = ox.escape_html(data['nameSort'])
            event.set_name_sort(value)
        event.matches = 0
        event.save()
        event.update_matches()
        response = json_response(status=200, text='created')
        response['data'] = event.json()
    else:
        response = json_response(status=409, text='name exists')
        response['data']['names'] = existing_names
    return render_to_json_response(response)
def info(epub):
    data = {}
    try:
        z = zipfile.ZipFile(epub)
    except zipfile.BadZipFile:
        logger.debug('invalid epub file %s', epub)
        return data
    opf = [f.filename for f in z.filelist if f.filename.endswith('opf')]
    if opf:
        info = ET.fromstring(z.read(opf[0]))
        metadata = info.findall('{http://www.idpf.org/2007/opf}metadata')
        if metadata:
            metadata = metadata[0]
            for e in metadata.getchildren():
                if e.text and e.text.strip() and e.text not in ('unknown', 'none'):
                    key = e.tag.split('}')[-1]
                    key = {
                        'creator': 'author',
                    }.get(key, key)
                    value = e.text.strip()
                    if key == 'identifier':
                        value = normalize_isbn(value)
                        if stdnum.isbn.is_valid(value):
                            data['isbn'] = [value]
                    elif key == 'author':
                        data[key] = value.split(', ')
                    else:
                        data[key] = value
    if 'description' in data:
        data['description'] = strip_tags(decode_html(data['description']))
    text = extract_text(epub)
    data['textsize'] = len(text)
    if not 'isbn' in data:
        isbn = extract_isbn(text)
        if isbn:
            data['isbn'] = [isbn]
    if 'date' in data and 'T' in data['date']:
        data['date'] = data['date'].split('T')[0]
    if 'language' in data and isinstance(data['language'], str):
        data['language'] = get_language(data['language'])
    return data
def save(self, *args, **kwargs):
    set_public_id = not self.id or not self.public_id
    layer = self.get_layer()
    if self.value:
        self.value = utils.cleanup_value(self.value, layer['type'])
        self.findvalue = ox.decode_html(ox.strip_tags(re.sub('<br */?>\n?', ' ', self.value))).replace('\n', ' ')
        self.findvalue = unicodedata.normalize('NFKD', self.findvalue).lower()
        sortvalue = sort_string(self.findvalue)
        if sortvalue:
            self.sortvalue = sortvalue[:900]
        else:
            self.sortvalue = None
    else:
        self.findvalue = None
        self.sortvalue = None

    #no clip or update clip
    if self.layer in settings.CONFIG.get('clipLayers', []):
        if not self.clip or self.start != self.clip.start or self.end != self.clip.end:
            self.clip, created = Clip.get_or_create(self.item, self.start, self.end)
    elif self.clip:
        self.clip = None

    super(Annotation, self).save(*args, **kwargs)
    if set_public_id:
        self.set_public_id()

    if self.clip:
        Clip.objects.filter(**{
            'id': self.clip.id,
            self.layer: False
        }).update(**{self.layer: True})
        #update clip.findvalue
        self.clip.save()

    #editAnnotations needs to be in sync
    if layer.get('type') == 'place' or layer.get('hasPlaces'):
        update_matches(self.id, 'place')
    if layer.get('type') == 'event' or layer.get('hasEvents'):
        update_matches(self.id, 'event')
def info(key, value):
    if key not in ('isbn',):
        raise IOError('unknown key %s' % key)
    if len(value) == 13:
        value = stdnum.isbn.to_isbn10(value)
    if len(value) != 10:
        raise IOError('invalid isbn %s' % value)
    url = 'http://www.amazon.com/dp/' + value
    data = read_url(url).decode()
    doc = lxml.html.document_fromstring(data)
    info = {}
    if '<title>404 - Document Not Found</title>' in data:
        return info
    if 'To discuss automated access to Amazon data please' in data:
        return info
    for l in doc.xpath('//link[@rel="canonical" and @href]'):
        info['asin'] = [l.get('href').rpartition('/')[-1]]
        break
    info['title'] = strip_tags(decode_html(doc.xpath('//span[@id="productTitle"]')[0].text))
    info['title'] = re.sub(' \([^\)]+? Classics\)', '', info['title'])
    info['title'] = re.sub(' \([^\)]+? Collection\)', '', info['title'])
    info['description'] = strip_tags(decode_html(unquote(re.compile('encodedDescription\' : "(.*?)",').findall(data)[0])))
    info['description'] = fix_bad_unicode(info['description'])
    content = doc.xpath('//div[@class="content"]')[0]
    content_info = {}
    for li in content.xpath('.//li'):
        v = li.text_content()
        if ': ' in v:
            k, v = li.text_content().split(': ', 1)
            content_info[k.strip()] = v.strip()
    if 'Language' in content_info:
        info['language'] = content_info['Language']
    if 'Publisher' in content_info:
        if ' (' in content_info['Publisher']:
            info['date'] = find_re(content_info['Publisher'].split(' (')[-1], '\d{4}')
        info['publisher'] = content_info['Publisher'].split(' (')[0]
        if '; ' in info['publisher']:
            info['publisher'], info['edition'] = info['publisher'].split('; ', 1)
    if 'ISBN-13' in content_info:
        if not 'isbn' in info:
            info['isbn'] = []
        info['isbn'].append(content_info['ISBN-13'].replace('-', ''))
    if 'ISBN-10' in content_info:
        if not 'isbn' in info:
            info['isbn'] = []
        info['isbn'].append(content_info['ISBN-10'])
    a = doc.xpath('//span[@class="a-size-medium"]')
    if a:
        for span in a:
            r = span.getchildren()[0].text.strip()
            role = get_role(r)
            if not role in info:
                info[role] = []
            info[role].append(span.text.strip())
    else:
        for span in doc.xpath('//span[@class="author notFaded"]'):
            author = [x.strip() for x in span.text_content().strip().split('\n') if x.strip()]
            role = get_role(author[-1])
            if not role in info:
                info[role] = []
            info[role].append(author[0])
    covers = re.compile('data-a-dynamic-image="({.+?})"').findall(data)[0]
    covers = json.loads(decode_html(covers))
    last = [0, 0]
    for url in covers:
        if covers[url] > last:
            last = covers[url]
            info['cover'] = re.sub('(\._SX.+?_\.)', '.', url)
    return info
def item(request, id):
    id = id.split('/')[0]
    template = 'index.html'
    level = settings.CONFIG['capabilities']['canSeeItem']['guest']
    if not request.user.is_anonymous():
        level = request.user.get_profile().level
    qs = models.Item.objects.filter(itemId=id, level__lte=level)
    if qs.count() == 0:
        context = RequestContext(request, {
            'base_url': request.build_absolute_uri('/'),
            'settings': settings
        })
    else:
        item = qs[0]
        template = 'item.html'
        keys = ['year', 'director', 'topic', 'summary']
        data = []
        for key in keys:
            value = item.get(key)
            if value:
                if isinstance(value, list):
                    value = u', '.join([unicode(v) for v in value])
                data.append({'key': key.capitalize(), 'value': value})
        clips = []
        clip = {'in': 0, 'annotations': []}
        #logged in users should have javascript. not adding annotations makes load faster
        if request.user.is_anonymous():
            for a in item.annotations.filter(
                    layer__in=models.Annotation.public_layers()).order_by('start', 'end', 'sortvalue'):
                if clip['in'] < a.start:
                    if clip['annotations']:
                        clip['annotations'] = '<br />\n'.join(clip['annotations'])
                        clips.append(clip)
                    clip = {'in': a.start, 'annotations': []}
                clip['annotations'].append(a.value)
        ctx = {
            'current_url': request.build_absolute_uri(request.get_full_path()),
            'base_url': request.build_absolute_uri('/'),
            'url': request.build_absolute_uri('/%s' % id),
            'id': id,
            'settings': settings,
            'data': data,
            'clips': clips,
            'icon': settings.CONFIG['user']['ui']['icons'] == 'frames' and 'icon' or 'poster',
            'title': ox.decode_html(item.get('title', '')),
            'description': item.get_item_description()
        }
        if not settings.USE_IMDB:
            value = item.get('topic' in keys and 'topic' or 'keywords')
            if isinstance(value, list):
                value = ', '.join(value)
            if value:
                ctx['keywords'] = ox.strip_tags(value)
        context = RequestContext(request, ctx)
    return render_to_response(template, context)
def contact(request):
    '''
    takes {
        email: string,
        subject: string,
        message: string
    }
    returns {
    }
    '''
    data = json.loads(request.POST['data'])
    name = data.get('name', '')
    email = data.get('email', '')
    if request.user.is_authenticated():
        if not name:
            name = request.user.username
        if not email:
            email = request.user.email
    if 'message' in data and data['message'].strip():
        email_from = '"%s" <%s>' % (settings.SITENAME, settings.CONFIG['site']['email']['system'])
        email_to = [settings.CONFIG['site']['email']['contact'], ]
        subject = data.get('subject', '').strip()
        template = loader.get_template('contact_email.txt')
        context = RequestContext(request, {
            'name': name,
            'email': email,
            'subject': subject,
            'message': ox.decode_html(data['message']).strip(),
            'sitename': settings.SITENAME,
            'footer': settings.CONFIG['site']['email']['footer'],
            'url': request.build_absolute_uri('/'),
        })
        subject = ox.decode_html(subject)
        message = ox.decode_html(template.render(context))
        response = json_response(text='message sent')
        try:
            send_mail(u'%s Contact - %s' % (settings.SITENAME, subject), message, email_from, email_to)
        except BadHeaderError:
            response = json_response(status=400, text='invalid data')
        if request.user.is_authenticated() \
                and 'receipt' in data \
                and data['receipt']:
            template = loader.get_template('contact_receipt.txt')
            context = RequestContext(request, {
                'name': name,
                'from': email,
                'sitename': settings.SITENAME,
                'footer': settings.CONFIG['site']['email']['footer'],
                'to': email_to[0],
                'subject': subject,
                'message': data['message'].strip(),
                'url': request.build_absolute_uri('/'),
            })
            message = template.render(context)
            try:
                send_mail('Fwd: %s' % subject, message, email_from, [email])
            except:
                pass
    else:
        response = json_response(status=400, text='invalid data')
    return render_to_json_response(response)
def addPlace(request):
    '''
    takes {
        name: "",
        alternativeNames: [],
        geoname: "",
        countryCode: '',
        south: float,
        west: float,
        north: float,
        east: float,
        lat: float,
        lng: float,
        area: float,
        type: ""
    }
    returns {
        id: string
    }
    '''
    #FIXME: check permissions
    data = json.loads(request.POST['data'])
    exists = False
    existing_names = []
    existing_geoname = ''
    name = data.pop('name')
    if name == '':
        _exists = True
        name = 'Untitled'
        n = 0
        while _exists:
            _exists = models.Place.objects.filter(defined=True, name_find__icontains=u'|%s|' % name).count() > 0
            if _exists:
                name = 'Untitled [%s]' % n
                n += 1
    names = [name] + data.get('alternativeNames', [])
    data['alternativeNames'] = [ox.escape_html(n) for n in data.get('alternativeNames', [])]
    name = ox.escape_html(name)
    for n in names:
        n = ox.decode_html(name)
        if models.Place.objects.filter(defined=True, name_find__icontains=u'|%s|' % n).count() != 0:
            exists = True
            existing_names.append(n)
    '''
    if 'geoname' in data:
        if models.Place.objects.filter(defined=True, geoname=data['geoname']).count() > 0:
            exists = True
            existing_geoname = data['geoname']
    '''
    if not exists:
        models.Place.objects.filter(defined=False, name__in=names).delete()
        place = models.Place()
        place.user = request.user
        place.name = name
        place.alternativeNames = tuple(data.pop('alternativeNames', []))
        for key in data:
            value = data[key]
            if isinstance(value, list):
                value = tuple(value)
            setattr(place, key, value)
        place.matches = 0
        place.save()
        place.update_matches()
        response = json_response(place.json())
    else:
        response = json_response(status=409, text='%s exists' % (existing_names and 'Name' or 'Geoname'))
        response['data']['names'] = existing_names
        if existing_geoname:
            response['data']['geoname'] = existing_geoname
    return render_to_json_response(response)
def editPlace(request):
    '''
    takes {
        id: string,
        name: string,
        north: int
    }
    returns {
        names: []
    }
    '''
    data = json.loads(request.POST['data'])
    place = get_object_or_404_json(models.Place, pk=ox.fromAZ(data['id']))
    names = data.get('name', [])
    if isinstance(names, basestring):
        names = [names]
    names = [ox.escape_html(n) for n in names]
    alternative_names = [ox.escape_html(n) for n in data.get('alternativeNames', [])]
    alternative_names = filter(lambda n: n.strip(), alternative_names)
    if place.editable(request.user):
        conflict = False
        conflict_names = []
        conflict_geoname = ''
        if alternative_names:
            data['alternativeNames'] = alternative_names
        for name in names + alternative_names:
            name = ox.decode_html(name)
            if models.Place.objects.filter(defined=True, name_find__icontains=u'|%s|' % name).exclude(id=place.id).count() != 0:
                conflict = True
                conflict_names.append(name)
        '''
        if 'geoname' in data:
            if models.Place.objects.filter(defined=True, geoname=data['geoname']).exclude(id=place.id).count() != 0:
                conflict = True
                conflict_geoname = data['geoname']
        '''
        if not conflict:
            models.Place.objects.filter(defined=False, name__in=names + alternative_names).delete()
            for key in data:
                if key != 'id':
                    value = data[key]
                    if isinstance(value, basestring):
                        value = ox.escape_html(value)
                    if isinstance(value, list):
                        value = tuple(value)
                    setattr(place, key, value)
            place.save()
            if 'name' in data or 'alternativeNames' in data:
                place.update_matches()
            response = json_response(place.json())
        else:
            response = json_response(status=409, text='%s exists' % (conflict_names and 'Name' or 'Geoname'))
            response['data']['names'] = conflict_names
            if conflict_geoname:
                response['data']['geoname'] = conflict_geoname
    else:
        response = json_response(status=403, text='permission denied')
    return render_to_json_response(response)