def test_scrape(self):
    og = opengraph.OpenGraph(url='http://graingert.co.uk/',
                             required_attrs=("description",), scrape=True)
    self.assertTrue(og.is_valid())
    self.assertTrue(og.items["description"])
    og = opengraph.OpenGraph(
        url='http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html',
        required_attrs=("description",), scrape=True)
    self.assertEqual(og.items["description"], "Beautiful Soup Documentation")
def fetch_og_result(url):
    result = opengraph.OpenGraph(url=url)
    if result:
        if 'title' in result or 'description' in result:
            fetch_image_dimensions(result)
            return result
    return {}
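# fetch_image_dimensions is not defined in this snippet. A minimal sketch of
# what it might do, assuming Pillow and requests are available and that the
# OpenGraph result carries an 'image' URL (both are assumptions, not part of
# the original):
def fetch_image_dimensions(result):
    import io

    import requests
    from PIL import Image

    image_url = result.get('image')
    if not image_url:
        return
    resp = requests.get(image_url, timeout=5)
    resp.raise_for_status()
    # Pillow reads the dimensions from the downloaded bytes
    width, height = Image.open(io.BytesIO(resp.content)).size
    result['image_width'], result['image_height'] = width, height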
def fetch_og_metadata(user_agent, links):
    res = []
    for l in links:
        check_url(l)

        # Remove any AP actor from the list
        try:
            p = lookup(l)
            if p.has_type(ap.ACTOR_TYPES):
                continue
        except NotAnActivityError:
            pass

        r = requests.get(l, headers={"User-Agent": user_agent}, timeout=15)
        r.raise_for_status()

        if not r.headers.get("content-type").startswith("text/html"):
            logger.debug(f"skipping {l}")
            continue

        r.encoding = "UTF-8"
        html = r.text
        try:
            data = dict(opengraph.OpenGraph(html=html))
        except Exception:
            logger.exception(f"failed to parse {l}")
            continue

        if data.get("url"):
            res.append(data)

    return res
def test_no_json(self):
    if getattr(opengraph, 'import_json', None) is not None:
        # python2
        opengraph.import_json = False
    else:
        # python3
        opengraph.opengraph.import_json = False
    og = opengraph.OpenGraph(url='http://www.ogp.me/')
    self.assertEqual(og.to_json(), "{'error':'there isn't json module'}")
def parse_url(url):
    ogp_result = opengraph.OpenGraph(url=url)
    if ogp_result.is_valid():
        return parse_valid_url(url, ogp_result)
    else:
        return parse_non_valid_url(url)
def get_opengraph(self, post, url):
    try:
        og = opengraph.OpenGraph(url=url)
        if og.is_valid():
            post.opengraph = og
            post.save()
    except (URLError, HTTPError) as e:
        raise self.retry(exc=e)
def parse_html(html):
    ogp_result = opengraph.OpenGraph(html=html)
    if ogp_result.is_valid():
        return parse_valid_html(html, ogp_result)
    else:
        return parse_non_valid_html(html)
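# A hedged usage sketch for parse_html above. The markup is illustrative and
# parse_valid_html/parse_non_valid_html are assumed to exist elsewhere in the
# module; whether is_valid() passes depends on which required attributes the
# installed opengraph fork enforces by default.
SAMPLE_HTML = """
<html><head>
  <meta property="og:title" content="Example" />
  <meta property="og:type" content="website" />
  <meta property="og:url" content="http://example.com/" />
  <meta property="og:image" content="http://example.com/cover.png" />
</head><body></body></html>
"""
result = parse_html(SAMPLE_HTML)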
def rendered_wall_posts(wall_posts):
    for wall_post in wall_posts:
        title = ''
        desc = ''
        site_image = ''
        article_title = ''
        urls = re.findall(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            wall_post.data['post_content'])
        for url in urls:
            parse_obj = urlparse.urlparse(url)
            site = parse_obj.netloc
            path = parse_obj.path
            conn = httplib.HTTPConnection(site)
            conn.request('HEAD', path)
            response = conn.getresponse()
            conn.close()
            ctype = response.getheader('Content-Type')
            if response.status < 400 and ctype and ctype.startswith('image'):
                # Bare image link: embed the image directly
                wall_post.data['post_content'] = (
                    wall_post.data['post_content']
                    + "<br/><a href='" + url + "' target='_blank'>"
                    + "<img width='300' src='" + url + "'/></a>")
            else:
                og = opengraph.OpenGraph(url)
                if len(og.items()) != 2:
                    for x, y in og.items():
                        if x == 'type' and y == 'video':
                            for k, l in og.items():
                                if k == 'site_name' and l == 'YouTube':
                                    url_data = urlparse.urlparse(url)
                                    query = urlparse.parse_qs(url_data.query)
                                    video = query["v"][0]
                                    wall_post.data['post_content'] = (
                                        wall_post.data['post_content'].replace(url, "")
                                        + "<br/><iframe width='300' height='200'"
                                        + " src='//www.youtube.com/embed/" + video
                                        + "' frameborder='0' allowfullscreen></iframe>")
                                elif k == 'site_name' and l == 'Vimeo':
                                    url_data = urlparse.urlparse(url)
                                    video = url_data.path
                                    wall_post.data['post_content'] = (
                                        wall_post.data['post_content'].replace(url, "")
                                        + "<br/><iframe src='//player.vimeo.com/video" + video
                                        + "' width='300' height='200' frameborder='0'"
                                        + " webkitallowfullscreen mozallowfullscreen"
                                        + " allowfullscreen></iframe> <p></p>")
                        elif x == 'type' and y == 'article':
                            for k, l in og.items():
                                if k == 'title':
                                    article_title = l
                                elif k == 'site_name':
                                    title = l
                                elif k == 'description':
                                    desc = l
                                elif k == 'image':
                                    site_image = l
                            wall_post.data['post_content'] = (
                                wall_post.data['post_content']
                                + "<br/><table><tr><td><img width='50' src='" + site_image
                                + "'/></td><td><a href='" + url + "' target='_blank'>"
                                + article_title + "</a><br/>" + title + "</td></tr></table>")
                        elif x == 'type':
                            for k, l in og.items():
                                if k == 'site_name':
                                    title = l
                                elif k == 'description':
                                    desc = l
                                elif k == 'image':
                                    site_image = l
                            wall_post.data['post_content'] = wall_post.data['post_content'].replace(
                                url,
                                "<table><tr><td><img width='50' src='" + site_image
                                + "'/></td><td><a href='" + url + "' target='_blank'>"
                                + title + "</a><br/>" + desc + "</td></tr></table>")
                else:
                    # No OG metadata beyond the defaults: render a plain link
                    wall_post.data['post_content'] = wall_post.data['post_content'].replace(
                        url, "<a href='" + url + "' target='_blank'>" + url + "</a>")
    return wall_posts
def otherThumb(self, url):
    try:
        site = opengraph.OpenGraph(url=url)
    except Exception:
        return 'failed'
    if site.is_valid():
        image = site.image
    else:
        return 'failed'
    return image
def extract_ograph_title(text):
    text_without_hashtag = ' '.join(text.split(' ')[1:])
    url_pattern = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]'
                   r'|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    urls = re.findall(url_pattern, text_without_hashtag)
    if urls:
        content = opengraph.OpenGraph(url=urls[0])
        title = content.get('title', text_without_hashtag)
        return urls[0], title.encode('utf-8')
    return None, text_without_hashtag.encode('utf-8')
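# Illustrative call for extract_ograph_title (the hashtag text and returned
# title are made up; the real title comes from the page's og:title tag):
#
# >>> extract_ograph_title('#reading http://example.com/article')
# ('http://example.com/article', b'An Example Article')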
def fetch_and_extract(url):
    data = opengraph.OpenGraph(url=url)
    p = Page()
    p.title = data.get("title", None)
    p.description = data.get("description", None)
    p.canon_url = data.get("url", None)
    p.image_url = data.get("image", None)
    p.ogp_type = data.get("type", None)
    p.fetch_url = data.get("_url", None)
    p.site_name = data.get("site_name", None)
    return p
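# Page is not defined in this snippet. A minimal stand-in that makes
# fetch_and_extract runnable (an assumption; the real class is likely an ORM
# model with these fields):
class Page(object):
    title = None
    description = None
    canon_url = None
    image_url = None
    ogp_type = None
    fetch_url = None
    site_name = None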
def getProductInfo(product_url):
    '''gathering data from Open Graph'''
    prod_url = minimioche_url + product_url
    # print(prod_url)
    prod_site = opengraph.OpenGraph(url=prod_url)
    # product_data = dict.fromkeys(product_keys)
    if prod_site.is_valid():
        product_data = json.loads(prod_site.to_json().encode('utf-8'))
        product_data['colors'] = getColors(prod_url)
        return product_data
    else:
        print(prod_url + " didn't work")
        return 'na'
def create(self, data):
    try:
        group = Group.objects.get(id=self.context['view'].kwargs.get("pk"))
    except Exception:
        raise serializers.ValidationError({'error': 'Group Not found'})
    current_user = self.context['request'].user
    data = self.context['request'].data
    return Post.objects.create(
        ogp=str(opengraph.OpenGraph(url=data["link"])) if 'link' in data else "",
        link=data.get("link", None),
        picture=data.get("picture", None),
        content=data["content"],
        group=group,
        creator=current_user)
async def unwatch(self, ctx, url: str):
    """Remove a YouTube/Twitch channel from the watch queue.

    XXX Only works with YouTube channels right now."""
    if not url:
        await self.bot.responses.failure(
            title="No URL Specified",
            message="You need to give me a URL!")
        return

    found = False
    og = opengraph.OpenGraph(url=url)
    channel_url = og.get('url', '')

    if channel_url.startswith("https://www.youtube.com/channel/"):
        channel_id = channel_url.replace("https://www.youtube.com/channel/", "")
        for i in range(len(self.checklist)):
            if self.checklist[i].channel_id == channel_id:
                for j in range(len(self.checklist[i].discord_channels)):
                    if self.checklist[i].discord_channels[j].channel == ctx.message.channel.id:
                        self.checklist[i].discord_channels.pop(j)
                        found = True
                        break  # indices shift after pop, so stop scanning
                if len(self.checklist[i].discord_channels) == 0:
                    self.checklist.pop(i)
                break
    elif False:  # placeholder branch (Twitch support not implemented)
        pass
    else:
        await self.bot.responses.failure(
            title="Not a YouTube/Twitch Channel",
            message="The URL you have given me is not a YouTube/Twitch channel!")
        return

    if found:
        await self.bot.responses.basic(
            message="This channel has been removed!")
    else:
        await self.bot.responses.failure(
            title="Channel Never Watched",
            message="I was never watching this YouTube/Twitch channel in this Discord channel!")

    self._save()
def media(entity_id=None):
    service_url = current_app.hypothesis_client.service
    # hypothesis_api_url = "https://hypothes.is/api/"
    hypothesis_api_url = service_url + '/api/'
    hypothesis_username = "******".format(
        username=current_user.username,
        authority=os.environ.get('HYPOTHESIS_AUTHORITY'))

    if entity_id is None:
        media_base_url = url_for("main.media", _external=True)
        entities_media = Entity.query.filter(
            Entity.description.like('media:%')).all()
        return render_template('main/media.html',
                               data=entities_media,
                               hypothesis_api_url=hypothesis_api_url,
                               hypothesis_username=hypothesis_username,
                               media_base_url=media_base_url)

    if sys.version_info[0] < 3:
        import opengraph
    else:
        import opengraph_py3 as opengraph

    entity_meta = EntityMeta.query \
        .filter_by(entity_id=entity_id) \
        .filter(EntityMeta.type_.like("opengraph_url")) \
        .all()
    if not entity_meta:
        data = None
    else:
        entity_meta = entity_meta[-1]
        url = entity_meta.description
        data = opengraph.OpenGraph(url=url)

    hypothesis_grant_token = current_app.hypothesis_client.grant_token(
        username=current_user.username)
    keyword = request.args.get('mark', None)

    return render_template(
        'main/display_media.html',
        data=data,
        entity_meta=entity_meta,
        hypothesis_api_url=hypothesis_api_url,
        hypothesis_grant_token=hypothesis_grant_token.decode(),
        service_url=service_url)
def create(self, data):
    try:
        group = Group.objects.get(id=self.context['view'].kwargs.get("pk"))
    except Exception:
        raise PermissionError('Group Not found')
    current_user = self.context['request'].user
    data = self.context['request'].data
    return Post.objects.create(
        ogp=str(opengraph.OpenGraph(url=data["link"])) if 'link' in data else "",
        link=data.get("link", None),
        picture=data.get("picture", None),
        content=data["content"],
        group=group,
        creator=current_user,
        is_sponsored=(data.get("is_sponsored", 'false').strip().lower() == 'true'),
        is_announcement=(data.get("is_announcement", 'false').strip().lower() == 'true'))
def results():
    try:
        want1 = request.form['keyword']
        header = "http://"
        url = header + want1
        print(url)
        f = urlopen(url).read()
        tree = etree.HTML(f)
        general = tree.xpath("//meta[@name='description']")[0].get("content")
        meta = opengraph.OpenGraph(url)
        print(meta)
        return render_template("results.html", data=meta, gog=general)
    except IOError:
        meta = {}
        general = "Invalid Url"
        return render_template("results.html", data=meta, gog=general)
    except Exception:
        meta = {}
        general = "these sites dont have a meta tags description they are THE BOSS!!!!!"
        return render_template("results.html", data=meta, gog=general)
async def watch(self, ctx, url: str,
                message="%(title)s by %(channelTitle)s just published!"):
    """Add a YouTube/Twitch channel to watch for new uploads.

    XXX Only works with YouTube channels right now.
    XXX Sort of works with livestreams on YouTube. The bot announces new
        video uploads, and YouTube treats livestreams like videos.
    XXX Doesn't allow you to change the Discord channel."""
    if not url:
        await self.bot.responses.failure(
            title="No URL Specified",
            message="You need to give me a URL!")
        return

    og = opengraph.OpenGraph(url=url)
    channel_url = og.get('url', '')

    if channel_url.startswith("https://www.youtube.com/channel/"):
        self.checklist.append(
            YouTubeItem(
                self.youtube,
                channel_url.replace("https://www.youtube.com/channel/", ""),
                DiscordChannel(self.bot, ctx.message.channel.id,
                               message + " %(url)s")))
        await self.bot.responses.basic(
            message="This YouTube channel has been added!")
    elif False:  # placeholder branch (Twitch support not implemented)
        pass
    else:
        await self.bot.responses.failure(
            title="Not a YouTube/Twitch Channel",
            message="The URL you have given me is not a YouTube/Twitch channel!")
        return

    self._save()
def fetch_og_metadata(user_agent, col, remote_id):
    doc = col.find_one({'remote_id': remote_id})
    if not doc:
        raise ValueError
    note = doc['activity']['object']
    print(note)
    links = links_from_note(note)
    if not links:
        return 0
    # FIXME(tsileo): set the user agent by giving HTML directly to OpenGraph
    htmls = []
    for l in links:
        check_url(l)
        r = requests.get(l, headers={'User-Agent': user_agent})
        r.raise_for_status()
        htmls.append(r.text)
    links_og_metadata = [dict(opengraph.OpenGraph(html=html)) for html in htmls]
    col.update_one({'remote_id': remote_id},
                   {'$set': {'meta.og_metadata': links_og_metadata}})
    return len(links)
def test_is_not_valid(self):
    og = opengraph.OpenGraph(url='http://vdubmexico.com')
    self.assertFalse(og.is_valid())
def test_required(self):
    og = opengraph.OpenGraph(url='http://grooveshark.com',
                             required_attrs=("description",), scrape=True)
    self.assertTrue(og.is_valid())
def test_to_html(self):
    og = opengraph.OpenGraph(html=HTML)
    self.assertTrue(og.to_html())
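# The HTML fixture used above is defined elsewhere in the test module; a
# minimal fixture in the spirit of the canonical ogp.me example might look
# like this (illustrative only):
HTML = """
<html xmlns:og="http://ogp.me/ns#">
<head>
  <meta property="og:title" content="The Rock" />
  <meta property="og:type" content="movie" />
  <meta property="og:url" content="http://www.imdb.com/title/tt0117500/" />
  <meta property="og:image" content="http://ia.media-imdb.com/images/rock.jpg" />
</head>
<body></body>
</html>
"""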
def test_is_valid(self):
    og = opengraph.OpenGraph(url='http://grooveshark.com')
    self.assertTrue(og.is_valid())
def test_isinstance(self):
    data = opengraph.OpenGraph()
    self.assertTrue(isinstance(data, opengraph.OpenGraph))
def test_url(self):
    data = opengraph.OpenGraph(url='http://vimeo.com/896837')
    self.assertEqual(data.items['url'], 'http://vimeo.com/896837')
def scrap(url_):
    print('work!' + url_)
    og = opengraph.OpenGraph(url=url_)
    db.update(url_, og)
    print(og)
def test_no_json(self):
    opengraph.import_json = False
    og = opengraph.OpenGraph(url='http://grooveshark.com')
    self.assertEqual(og.to_json(), "{'error':'there isn't json module'}")
def fetch_og_metadata(user_agent, links):
    res = []
    for l in links:
        # Try to skip media early
        mimetype, _ = mimetypes.guess_type(l)
        if mimetype and mimetype.split("/")[0] in ["image", "video", "audio"]:
            logger.info(f"skipping media link {l}")
            continue

        check_url(l)

        # Remove any AP objects
        try:
            lookup(l)
            continue
        except NotAnActivityError:
            pass
        except Exception:
            logger.exception(f"skipping {l} because of issues during AP lookup")
            continue

        try:
            h = requests.head(l, headers={"User-Agent": user_agent},
                              timeout=3, allow_redirects=True)
            h.raise_for_status()
        except requests.HTTPError as http_err:
            logger.debug(
                f"failed to HEAD {l}, got a {http_err.response.status_code}: "
                f"{http_err.response.text}")
            continue
        except requests.RequestException as err:
            logger.debug(f"failed to HEAD {l}: {err!r}")
            continue

        if h.headers.get("content-type") and not h.headers.get(
                "content-type").startswith("text/html"):
            logger.debug(f"skipping {l} for bad content type")
            continue

        try:
            r = requests.get(l, headers={"User-Agent": user_agent},
                             timeout=5, allow_redirects=True)
            r.raise_for_status()
        except requests.HTTPError as http_err:
            logger.debug(
                f"failed to GET {l}, got a {http_err.response.status_code}: "
                f"{http_err.response.text}")
            continue
        except requests.RequestException as err:
            logger.debug(f"failed to GET {l}: {err!r}")
            continue

        # FIXME(tsileo): check mimetype via the URL too (like we do for images)
        if not r.headers.get("content-type") or not r.headers.get(
                "content-type").startswith("text/html"):
            continue

        r.encoding = "UTF-8"
        html = r.text
        try:
            data = dict(opengraph.OpenGraph(html=html))
        except Exception:
            logger.exception(f"failed to parse {l}")
            continue

        # Keep track of the fetched URL as some crappy websites use relative
        # URLs everywhere
        data["_input_url"] = l
        u = urlparse(l)

        # If it's a relative URL, build the absolute version
        if "image" in data and data["image"].startswith("/"):
            data["image"] = u._replace(path=data["image"], params="",
                                       query="", fragment="").geturl()
        if "url" in data and data["url"].startswith("/"):
            data["url"] = u._replace(path=data["url"], params="",
                                     query="", fragment="").geturl()

        if data.get("url"):
            res.append(data)

    return res
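# A hedged usage sketch for fetch_og_metadata above, assuming check_url,
# lookup, and logger are wired up as in the surrounding project (the URLs
# are illustrative):
if __name__ == "__main__":
    metadata = fetch_og_metadata(
        "my-bot/1.0 (+http://example.com/bot)",
        ["http://example.com/post/1", "http://example.com/cat.png"],
    )
    for entry in metadata:
        print(entry.get("title"), entry.get("url"))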
def test_absolute(self):
    og = opengraph.OpenGraph(
        url='http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html',
        required_attrs=("image",), scrape=True)
    self.assertEqual(og.items["image"],
                     "http://www.crummy.com/software/BeautifulSoup/bs3/6.1.jpg")
def test_to_json(self):
    og = opengraph.OpenGraph(
        url='http://www.youtube.com/watch?v=XAyNT2bTFuI')
    self.assertTrue(og.to_json())
    self.assertTrue(isinstance(og.to_json(), str))
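# For reference, the core API the tests above exercise, sketched as comments
# because the attribute surface differs between opengraph forks (some expose
# parsed values as dict keys, others via an `items` mapping):
#
# import opengraph
# og = opengraph.OpenGraph(url='http://ogp.me/')   # fetch and parse a page
# og = opengraph.OpenGraph(html=HTML)              # or parse markup directly
# og.is_valid()                                    # required og:* tags present?
# og.to_json(); og.to_html()                       # serialized views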