def og_scrape_worker(canonical_url):
    """Scrape Open Graph data for *canonical_url* and persist the outcome.

    On valid OG markup the data is stored via ``add_data_to_url_in_db``;
    on invalid markup or any exception the URL is marked as errored so the
    failure is recorded rather than silently dropped.
    """
    try:
        og_data = OpenGraph(url="http://%s" % canonical_url)
        if og_data.is_valid():
            return add_data_to_url_in_db(canonical_url, og_data)
        else:
            update_to_error_url_status_in_db(canonical_url)
    except Exception as e:
        # Broad catch: this runs on a worker thread, so never let it die silently.
        # NOTE: was a Python 2 print statement; print() works on both 2 and 3.
        print("Error occurred on scrap worker thread: %s" % e)
        update_to_error_url_status_in_db(canonical_url)
def __fetch_open_graph_details(self):
    """Fill missing title/description/image_url from the page's Open Graph tags.

    Existing attribute values take precedence; OG data is only a fallback.
    Network/parse failures are logged and swallowed (best effort).
    """
    try:
        og = OpenGraph(url=self.url)
        if og.is_valid():
            _json = json.loads(og.to_json())
            self.title = self.title or _json['title']
            self.description = self.description or _json['description']
            self.image_url = self.image_url or _json['image']
    except Exception as e:
        print(e.__doc__)
        # e.message does not exist in Python 3 (AttributeError); str(e) is portable.
        print(str(e))
    return
def __fetch_open_graph_details(self):
    """Set name and a saved Image from the page's Open Graph title/image tags.

    Creates and saves a new Image row for the OG image URL and attaches it
    to this object. Failures are logged and swallowed (best effort).
    """
    try:
        og = OpenGraph(url=self.url)
        if og.is_valid():
            _json = json.loads(og.to_json())
            self.name = _json['title']
            img = Image(user=self.user, large_url=_json['image'], added_time=timezone.now())
            img.save()
            self.image = img
    except Exception as e:
        print(e.__doc__)
        # e.message does not exist in Python 3 (AttributeError); str(e) is portable.
        print(str(e))
    return
def get_url_data(url):
    """Fetch *url* and return its Open Graph data as a dict, or None on failure.

    Returns None when the fetch fails (HTTP/URL errors) or when the page
    carries no valid Open Graph markup.
    """
    try:
        # urlopen raises on failure rather than returning None, so no None
        # check is needed; close the response explicitly to avoid a leak.
        raw = urlopen(url)
        try:
            html = raw.read()
        finally:
            raw.close()
        data = OpenGraph(html=html, scrape=True)
        if data.is_valid():
            return dict(data)
        return None
    except (HTTPError, URLError, AttributeError):
        return None
def _parse_open_graph(article):
    """Populate missing *article* fields from the Open Graph tags in its HTML.

    Returns silently when the page has no valid OG markup. Raises
    NotImplementedError for OG types other than "article". Existing field
    values are kept; OG data only fills in what is missing.
    """
    og = OpenGraph(html=article.html)
    if not og.is_valid():
        return
    if og["type"] != "article":
        raise NotImplementedError("Cannot parse a OG type: %s" % og["type"])
    # og is a dict subclass, so .get() already yields None for absent keys;
    # the original og.setdefault(None) was a no-op that only inserted a
    # spurious None key, and has been removed.
    article.title = article.title or og.get("title")
    article.summary = article.summary or og.get("description")
    article.images = article.images or [og.get("image")]
    article.meta_lang = article.meta_lang or og.get("locale")
    article.keywords = article.keywords or og.get("tag")
    article.categories = article.categories or [og.get("category")]
    article.authors = article.authors or [og.get("author")]
    article.pub_date = article.pub_date or og.get("modified_date")