def test_open_graph_all_properties(self):
    """Fetch the Open Graph "all properties" fixture and check each parsed field."""
    page_url = 'http://lassie.it/open_graph/all_properties.html'
    result = lassie.fetch(page_url)

    # Page-level metadata.
    page_expectations = (
        ('url', page_url),
        ('title', 'Lassie Open Graph All Properies Test'),
        ('description', 'Just a test template with OG data!'),
        ('locale', 'en_US'),
    )
    for key, expected in page_expectations:
        self.assertEqual(result[key], expected)

    # Exactly one image is expected, with its source, size and type.
    self.assertEqual(len(result['images']), 1)
    og_image = result['images'][0]
    image_expectations = (
        ('src', 'http://i.imgur.com/cvoR7zv.jpg'),
        ('width', 550),
        ('height', 365),
        ('type', 'og:image'),
    )
    for key, expected in image_expectations:
        self.assertEqual(og_image[key], expected)

    # Exactly one video is expected, with its source, size and type.
    self.assertEqual(len(result['videos']), 1)
    og_video = result['videos'][0]
    video_expectations = (
        ('src', 'http://www.youtube.com/v/dQw4w9WgXcQ?version=3&autohide=1'),
        ('width', 640),
        ('height', 480),
        ('type', 'application/x-shockwave-flash'),
    )
    for key, expected in video_expectations:
        self.assertEqual(og_video[key], expected)
def test_open_graph_no_og_title_no_og_url(self):
    """Check the url and title returned for the no-og:title / no-og:url fixture."""
    page_url = 'http://lassie.it/open_graph/no_og_title_no_og_url.html'
    expected_title = 'Lassie Open Graph Test | No og:title, No og:url'

    result = lassie.fetch(page_url)

    # The returned url must equal the URL that was requested.
    self.assertEqual(result['url'], page_url)
    self.assertEqual(result['title'], expected_title)
def test_generic_favicon(self):
    """Fetch the favicon fixture and expect a single image typed 'favicon'."""
    result = lassie.fetch('http://lassie.it/generic/favicon.html')

    self.assertEqual(len(result['images']), 1)
    only_image = result['images'][0]
    self.assertEqual(only_image['type'], 'favicon')
def test_generic_favicon(self):
    """Fetch the favicon fixture and expect a single image typed "favicon"."""
    result = lassie.fetch("http://lassie.it/generic/favicon.html")

    self.assertEqual(len(result["images"]), 1)
    only_image = result["images"][0]
    self.assertEqual(only_image["type"], "favicon")
def test_twitter_no_og_title_use_twitter_title(self):
    """Check description and title for the Twitter-card fixture without og:title."""
    page_url = 'http://lassie.it/twitter_card/no_og_title_use_twitter_title.html'
    expected_description = 'A test case for Lassie!'
    expected_title = 'Lassie Twitter Test | no_og_title_use_twitter_title'

    result = lassie.fetch(page_url)

    self.assertEqual(result['description'], expected_description)
    self.assertEqual(result['title'], expected_title)
def test_generic_all_properties(self):
    """Fetch the generic "all properties" fixture and check locale, title, description and keywords."""
    result = lassie.fetch('http://lassie.it/generic/all_properties.html')

    expectations = (
        ('locale', 'en_US'),
        ('title', 'Lassie Generic Test | all_properties'),
        ('description', 'Just a random description of a web page.'),
    )
    for key, expected in expectations:
        self.assertEqual(result[key], expected)

    # Five keywords are expected.
    self.assertEqual(len(result['keywords']), 5)
def test_generic_all_properties(self):
    """Fetch the generic "all properties" fixture and check locale, title, description and keywords."""
    result = lassie.fetch("http://lassie.it/generic/all_properties.html")

    expectations = (
        ("locale", "en_US"),
        ("title", "Lassie Generic Test | all_properties"),
        ("description", "Just a random description of a web page."),
    )
    for key, expected in expectations:
        self.assertEqual(result[key], expected)

    # Five keywords are expected.
    self.assertEqual(len(result["keywords"]), 5)
def test_open_graph_og_image_plus_two_body_images(self):
    """Compare the image list with and without ``all_images=True``."""
    page_url = "http://lassie.it/open_graph/og_image_plus_two_body_images.html"

    # First fetch without "all_images": a single image is expected.
    default_result = lassie.fetch(page_url)
    self.assertEqual(len(default_result["images"]), 1)

    # Then fetch with "all_images": three images are expected.
    full_result = lassie.fetch(page_url, all_images=True)
    self.assertEqual(len(full_result["images"]), 3)

    images = full_result["images"]
    first, second, third = images[0], images[1], images[2]
    expected_types = ("og:image", "body_image", "body_image")
    for image, expected_type in zip((first, second, third), expected_types):
        self.assertEqual(image["type"], expected_type)
def test_open_graph_og_image_plus_two_body_images(self):
    """Compare the image list with and without ``all_images=True``."""
    page_url = 'http://lassie.it/open_graph/og_image_plus_two_body_images.html'

    # First fetch without "all_images": a single image is expected.
    default_result = lassie.fetch(page_url)
    self.assertEqual(len(default_result['images']), 1)

    # Then fetch with "all_images": three images are expected.
    full_result = lassie.fetch(page_url, all_images=True)
    self.assertEqual(len(full_result['images']), 3)

    images = full_result['images']
    first, second, third = images[0], images[1], images[2]
    expected_types = ('og:image', 'body_image', 'body_image')
    for image, expected_type in zip((first, second, third), expected_types):
        self.assertEqual(image['type'], expected_type)
def __init__(self, *args, **kwargs):
    """Run the parent initialiser, then try to fill title/description from ``self.url``.

    The metadata lookup is best effort: if ``lassie.fetch`` raises
    ``lassie.LassieError`` the attributes are left untouched.
    """
    super().__init__(*args, **kwargs)

    try:
        metadata = lassie.fetch(self.url)
    except lassie.LassieError:
        # Fetch failed; keep whatever the parent initialiser set up.
        return

    self.title = metadata.get('title')
    self.description = metadata.get('description')
def test_image_file(self):
    """Fetch an image URL with ``handle_file_content=True`` and check the result."""
    file_url = 'http://lassie.it/handle_file_content/image_file.jpg'
    result = lassie.fetch(file_url, handle_file_content=True)

    self.assertEqual(result['url'], file_url)
    self.assertEqual(result['title'], 'image_file.jpg')

    # The file itself is expected as the single body image.
    self.assertEqual(len(result['images']), 1)
    only_image = result['images'][0]
    image_expectations = (
        ('src', 'http://lassie.it/handle_file_content/image_file.jpg'),
        ('type', 'body_image'),
    )
    for key, expected in image_expectations:
        self.assertEqual(only_image[key], expected)
def Lassie():
    """Fetch metadata for a fixed URL (all images, file content handled) and print it."""
    import lassie

    target = "https://www.sitebuilderreport.com/stock-up"
    print(lassie.fetch(target, all_images=True, handle_file_content=True))


#Lassie()
#----------------------------------------------------------------------------------------------------------------------#
def test_twitter_all_properties(self):
    """Fetch the Twitter-card "all properties" fixture and check each parsed field."""
    result = lassie.fetch('http://lassie.it/twitter_card/all_properties.html')

    # Page-level metadata.
    page_expectations = (
        ('url', 'http://www.youtube.com/watch?v=fWNaR-rxAic'),
        ('title', 'Carly Rae Jepsen - Call Me Maybe'),
        ('description', 'Buy Now! http://smarturl.it/CallMeMaybe Music video by Carly Rae Jepsen performing Call Me Maybe. (C) 2011 604 Records Inc. #VEVOCertified on June 8, 2012. h...'),
    )
    for key, expected in page_expectations:
        self.assertEqual(result[key], expected)

    # Exactly one image is expected.
    self.assertEqual(len(result['images']), 1)
    card_image = result['images'][0]
    self.assertEqual(card_image['src'], 'http://i1.ytimg.com/vi/fWNaR-rxAic/maxresdefault.jpg')

    # Exactly one video is expected, with its source and size.
    self.assertEqual(len(result['videos']), 1)
    card_video = result['videos'][0]
    video_expectations = (
        ('src', 'https://www.youtube.com/embed/fWNaR-rxAic'),
        ('width', 1920),
        ('height', 1080),
    )
    for key, expected in video_expectations:
        self.assertEqual(card_video[key], expected)
def render_nostyle(url):
    """Render the generic ``article.html`` card for ``url`` from Lassie metadata.

    The metadata is read from ``redis_cache`` when present (stored as JSON);
    otherwise it is fetched with ``lassie.fetch`` and its JSON form is cached
    under ``url``.

    :param url: address of the page to describe
    :returns: the result of ``render_template('article.html', ...)``
    """
    cached = redis_cache.get(url)
    # Debug trace of the raw cache entry (single-argument form works on Python 2 and 3).
    print("{0} {1}".format(cached, type(cached)))

    if cached:
        meta = json.loads(cached)
    else:
        # Keep the parsed dict for rendering and cache only its serialized
        # form; rebinding the dict to the JSON string made the ``.get``
        # calls below fail on every cache miss.
        meta = lassie.fetch(url)
        redis_cache.set(url, json.dumps(meta))

    # Use the first image's source as thumbnail; an empty/missing image list
    # is passed through unchanged.
    thumbnail = meta.get('images')
    if thumbnail:
        thumbnail = thumbnail[0].get('src')

    return render_template(
        'article.html',
        _url=url,
        image=thumbnail,
        title=meta.get('title'),
        description=meta.get('description'),
        provider=urlparse(url).netloc,
    )
def test_twitter_all_properties(self):
    """Fetch the Twitter-card "all properties" fixture and check each parsed field."""
    result = lassie.fetch("http://lassie.it/twitter_card/all_properties.html")

    # Page-level metadata.
    page_expectations = (
        ("url", "http://www.youtube.com/watch?v=fWNaR-rxAic"),
        ("title", "Carly Rae Jepsen - Call Me Maybe"),
        ("description", "Buy Now! http://smarturl.it/CallMeMaybe Music video by Carly Rae Jepsen performing Call Me Maybe. (C) 2011 604 Records Inc. #VEVOCertified on June 8, 2012. h..."),
    )
    for key, expected in page_expectations:
        self.assertEqual(result[key], expected)

    # Exactly one image is expected.
    self.assertEqual(len(result["images"]), 1)
    card_image = result["images"][0]
    self.assertEqual(card_image["src"], "http://i1.ytimg.com/vi/fWNaR-rxAic/maxresdefault.jpg")

    # Exactly one video is expected, with its source and size.
    self.assertEqual(len(result["videos"]), 1)
    card_video = result["videos"][0]
    video_expectations = (
        ("src", "https://www.youtube.com/embed/fWNaR-rxAic"),
        ("width", 1920),
        ("height", 1080),
    )
    for key, expected in video_expectations:
        self.assertEqual(card_video[key], expected)
def _embed_ratio(resp):
    """Return the embed's height/width as a percentage, or 100 when it cannot be computed."""
    try:
        ratio = (float(resp.get("height", 1)) / resp.get("width", 1)) * 100
    except (TypeError, ValueError, ZeroDivisionError):
        # Null, non-numeric or zero dimensions: use the same fallback as a
        # non-positive ratio instead of failing the whole request.
        return 100
    return ratio if ratio > 0 else 100


def get_content(url):
    """Render the embed for ``url``, using the cache, oEmbed, then Lassie.

    Lookup order: a JSON entry in ``redis_cache``; otherwise
    ``cli.oembed`` (its ``raw`` payload is cached when present); otherwise
    ``lassie.fetch`` (its JSON form is cached).  The response is then
    dispatched on ``provider_name`` / ``type`` to the matching renderer, and
    anything unrecognised or empty falls back to ``render_nostyle``.

    :param url: address of the content to embed
    :returns: whatever the selected render function returns
    """
    # Read the cache once: a second lookup could miss if the key expires
    # between the two calls.
    cached = redis_cache.get(url)
    if cached:
        resp = json.loads(cached.decode('utf-8'))
        print("{0} item loaded from cache".format(resp))
    else:
        resp = cli.oembed(url, raw=True, words=30)
        raw = resp.get('raw')
        if raw:
            stored = redis_cache.set(url, raw)
            print("item cached: {0}".format(stored))
        else:
            resp = lassie.fetch(url)
            redis_cache.set(url, json.dumps(resp))
    print(resp)

    if not resp:
        return render_nostyle(url)

    provider = resp.get("provider_name")
    kind = resp.get("type")

    # Provider-specific renderers take precedence over the generic oEmbed type.
    if provider == "Twitter":
        return render_twitter(url)
    if provider == "YouTube":
        return render_youtube(resp.get("html"))

    if kind == "rich":
        print("rich")
        ratio = _embed_ratio(resp)
        print(ratio)
        # The rich branch has always passed the ratio as a string.
        return render_template("video.html", content=resp.get("html"), ratio=str(ratio))
    if kind == "video":
        print("video")
        ratio = _embed_ratio(resp)
        print(ratio)
        return render_template("video.html", content=resp.get("html"), ratio=ratio)
    if kind == "link":
        return render_template(
            "article.html",
            title=resp.get("title"),
            image=resp.get("thumbnail_url"),
            description=resp.get("description"),
            _url=resp.get("url"),
            provider=provider,
        )
    if kind == "photo":
        print("photo")
        return render_template("photo.html", _url=str(resp.get("url")), source=url)

    return render_nostyle(url)
def analyze(request):
    """Store the submitted article URL, analyse it and render the results page.

    Only POST requests are handled; for any other method the function falls
    through and returns ``None``, exactly as before.

    :param request: the incoming request; ``request.POST['article_url']`` is
        the URL to analyse (empty string when absent)
    :returns: the rendered ``main/results.html`` response for POST requests
    """
    if request.method == 'POST':
        url = request.POST.get("article_url", "")

        # Persist the article and attach the default result row directly to
        # the saved instance; re-fetching it by primary key was a redundant
        # query.
        article = Article(article_url=url, pub_date=timezone.now())
        article.save()
        article.result_set.create(author='Sith', recommend=True, rating=3)

        data = lassie.fetch(url)
        domain = get_domain(url)

        # get labels/tags associated with any sketchy websites
        label_results = result_data(domain)

        # 0: site isn't known to be unreliable, 1: site is listed as unreliable
        is_in_db = 0 if label_results == {} else 1
        source_descr = result_descr(is_in_db)

        extract_videos(url)
        alllabelinfo = label_info()

        return render(
            request,
            'main/results.html',
            {
                'data': data,
                'id': article.id,
                'domain': domain,
                'alllabelinfo': alllabelinfo,
                'labelinfo': label_results,
                'sourcedescr': source_descr,
                'indb': is_in_db,
            },
        )
def test_open_graph_all_properties(self):
    """Fetch the Open Graph "all properties" fixture and check each parsed field."""
    page_url = "http://lassie.it/open_graph/all_properties.html"
    result = lassie.fetch(page_url)

    # Page-level metadata.
    page_expectations = (
        ("url", page_url),
        ("title", "Lassie Open Graph All Properies Test"),
        ("description", "Just a test template with OG data!"),
        ("locale", "en_US"),
    )
    for key, expected in page_expectations:
        self.assertEqual(result[key], expected)

    # Exactly one image is expected, with its source, size and type.
    self.assertEqual(len(result["images"]), 1)
    og_image = result["images"][0]
    image_expectations = (
        ("src", "http://i.imgur.com/cvoR7zv.jpg"),
        ("width", 550),
        ("height", 365),
        ("type", "og:image"),
    )
    for key, expected in image_expectations:
        self.assertEqual(og_image[key], expected)

    # Exactly one video is expected, with its source, size and type.
    self.assertEqual(len(result["videos"]), 1)
    og_video = result["videos"][0]
    video_expectations = (
        ("src", "http://www.youtube.com/v/dQw4w9WgXcQ?version=3&autohide=1"),
        ("width", 640),
        ("height", 480),
        ("type", "application/x-shockwave-flash"),
    )
    for key, expected in video_expectations:
        self.assertEqual(og_video[key], expected)
def test_open_graph_all_properties(self):
    """Fetch the Open Graph "all properties" fixture and check each parsed field."""
    page_url = 'http://lassie.it/open_graph/all_properties.html'
    result = lassie.fetch(page_url)

    # Page-level metadata.
    self.assertEqual(result['url'], page_url)
    self.assertEqual(result['title'], 'Lassie Open Graph All Properies Test')
    self.assertEqual(result['description'], 'Just a test template with OG data!')
    self.assertEqual(result['locale'], 'en_US')

    # Exactly one image is expected, with its source, size and type.
    images = result['images']
    self.assertEqual(len(images), 1)
    og_image = images[0]
    image_expectations = (
        ('src', 'http://i.imgur.com/cvoR7zv.jpg'),
        ('width', 550),
        ('height', 365),
        ('type', 'og:image'),
    )
    for key, expected in image_expectations:
        self.assertEqual(og_image[key], expected)

    # Exactly one video is expected, with its source, size and type.
    videos = result['videos']
    self.assertEqual(len(videos), 1)
    og_video = videos[0]
    video_expectations = (
        ('src', 'http://www.youtube.com/v/dQw4w9WgXcQ?version=3&autohide=1'),
        ('width', 640),
        ('height', 480),
        ('type', 'application/x-shockwave-flash'),
    )
    for key, expected in video_expectations:
        self.assertEqual(og_video[key], expected)
def parse(url):
    """Return the result of ``lassie.fetch`` for ``url``."""
    return lassie.fetch(url)
def test_open_graph_og_image_relative_url(self):
    """Expect the first image's src to be an absolute URL under the page's directory."""
    result = lassie.fetch("http://lassie.it/open_graph/og_image_relative_url.html")

    first_image = result["images"][0]
    expected_src = "http://lassie.it/open_graph/name.jpg"
    self.assertEqual(first_image["src"], expected_src)
def test_canonical(self):
    """Fetch with ``canonical=True`` and expect the canonical URL as the result url."""
    result = lassie.fetch('http://lassie.it/generic/canonical.html', canonical=True)

    expected_url = 'http://example.com/canonical/path'
    self.assertEqual(result['url'], expected_url)
def test_no_title(self):
    """Expect no 'title' key in the result for the no-title fixture."""
    url = 'http://lassie.it/generic/no_title.html'
    data = lassie.fetch(url)

    # assertNotIn reports the offending container on failure, unlike
    # assertTrue(not ... in ...), which only says "False is not true".
    self.assertNotIn('title', data)
def test_open_graph_og_image_relative_url(self):
    """Expect the first image's src to be an absolute URL under the page's directory."""
    result = lassie.fetch('http://lassie.it/open_graph/og_image_relative_url.html')

    first_image = result['images'][0]
    expected_src = 'http://lassie.it/open_graph/name.jpg'
    self.assertEqual(first_image['src'], expected_src)
def test_twitter_no_og_title_use_twitter_title(self):
    """Check description and title for the Twitter-card fixture without og:title."""
    page_url = "http://lassie.it/twitter_card/no_og_title_use_twitter_title.html"
    expected_description = "A test case for Lassie!"
    expected_title = "Lassie Twitter Test | no_og_title_use_twitter_title"

    result = lassie.fetch(page_url)

    self.assertEqual(result["description"], expected_description)
    self.assertEqual(result["title"], expected_title)
def __init__(self, ssb_config):
    """Build the site-specific browser window described by ``ssb_config``.

    Restores the saved geometry, installs the keyboard shortcuts, sets the
    window icon (from settings, the configured favicon URL, or the favicon
    Lassie finds on the page), restores saved cookies and loads the page.

    :param ssb_config: mapping read for ``name``, ``title``, ``url`` and,
        optionally, ``favicon_url``
    """
    super(SSBWindow, self).__init__()

    # Load Settings
    self.config = ssb_config
    self.settings = QtCore.QSettings("pyssb", self.config['name'])

    # Window Position
    self.setWindowTitle(self.config['title'])
    try:
        self.restoreGeometry(self.settings.value("geometry"))
    except Exception:
        # Reasonable Defaults
        self.resize(900, 600)
        self.move(100, 100)

    ### Extra Browser Functionality
    self.search = SearchWidget(self)
    self.showSearch = QtWidgets.QShortcut(
        "Ctrl+F", self, activated=lambda: self.search.toggleSearch())
    self.quit = QtWidgets.QShortcut("Ctrl+Q", self, activated=self.close)
    self.zoomIn = QtWidgets.QShortcut(
        "Ctrl++", self,
        activated=lambda: self.setZoomFactor(self.zoomFactor() + .2))
    self.zoomIn2 = QtWidgets.QShortcut(
        "Ctrl+=", self,
        activated=lambda: self.setZoomFactor(self.zoomFactor() + .2))
    # Stored under its own name; it used to overwrite ``self.zoomIn``.
    self.zoomOut = QtWidgets.QShortcut(
        "Ctrl+-", self,
        activated=lambda: self.setZoomFactor(self.zoomFactor() - .2))
    self.zoomOne = QtWidgets.QShortcut(
        "Ctrl+0", self, activated=lambda: self.setZoomFactor(1))
    ###

    # Icon
    icon_data = self.settings.value("icon")
    if not icon_data:
        # Nothing stored yet: download the favicon and remember it.
        try:
            try:
                favicon_url = self.config['favicon_url']
            except Exception:
                # No favicon configured: ask Lassie for the page's favicon.
                page = lassie.fetch(self.config['url'])
                favicon_url = next(
                    (img['src'] for img in page['images']
                     if img['type'] == 'favicon'),
                    None)
            if favicon_url is None:
                raise LookupError("no favicon found")
            response = urlopen(favicon_url)
            try:
                icon_data = response.read()
            finally:
                response.close()
            self.settings.setValue('icon', icon_data)
        except Exception:
            # Best effort: any failure simply leaves the window without an icon.
            icon_data = None

    if icon_data:
        icon_img = QtGui.QImage.fromData(icon_data)
        icon_pix = QtGui.QPixmap.fromImage(icon_img)
        self.setWindowIcon(QtGui.QIcon(icon_pix))

    # Cookie Jar
    self.cookiejar = QtNetwork.QNetworkCookieJar(self)
    self.page().networkAccessManager().setCookieJar(self.cookiejar)

    # Load Cookies
    # NOTE(review): pickle.loads is only safe while the settings store is
    # trusted; consider a non-executable serialization format.
    try:
        raw_cookies = pickle.loads(self.settings.value("cookies"))
        cookies = [
            QtNetwork.QNetworkCookie.parseCookies(raw)[0]
            for raw in raw_cookies
        ]
        self.cookiejar.setAllCookies(cookies)
    except Exception:
        # Missing or unreadable cookie data: start with an empty jar.
        pass

    # Load Page
    self.load(QtCore.QUrl(self.config['url']))
def test_twitter_no_og_title_use_twitter_title(self):
    """Check description and title for the Twitter-card fixture without og:title."""
    result = lassie.fetch('http://lassie.it/twitter_card/no_og_title_use_twitter_title.html')

    expectations = (
        ('description', 'A test case for Lassie!'),
        ('title', 'Lassie Twitter Test | no_og_title_use_twitter_title'),
    )
    for key, expected in expectations:
        self.assertEqual(result[key], expected)
def test_open_graph_og_image_relative_url(self):
    """Expect the first image's src to be an absolute URL under the page's directory."""
    page_url = 'http://lassie.it/open_graph/og_image_relative_url.html'
    images = lassie.fetch(page_url)['images']

    self.assertEqual(images[0]['src'], 'http://lassie.it/open_graph/name.jpg')
def test_no_title(self):
    """Expect no "title" key in the result for the no-title fixture."""
    url = "http://lassie.it/generic/no_title.html"
    data = lassie.fetch(url)

    # assertNotIn reports the offending container on failure, unlike
    # assertTrue(not ... in ...), which only says "False is not true".
    self.assertNotIn("title", data)
def test_generic_bad_locale(self):
    """Expect no 'locale' key in the result for the bad-locale fixture."""
    url = 'http://lassie.it/generic/bad_locale.html'
    data = lassie.fetch(url)

    # assertNotIn reports the offending container on failure, unlike
    # assertTrue(not ... in ...), which only says "False is not true".
    self.assertNotIn('locale', data)
def test_generic_bad_locale(self):
    """Expect no "locale" key in the result for the bad-locale fixture."""
    url = "http://lassie.it/generic/bad_locale.html"
    data = lassie.fetch(url)

    # assertNotIn reports the offending container on failure, unlike
    # assertTrue(not ... in ...), which only says "False is not true".
    self.assertNotIn("locale", data)
import lassie from pprint import pprint sample = lassie.fetch('https://www.youtube.com/watch?v=R6IT_f0XPT8') print(sample) pprint(sample) print("*" * 100) sample = lassie.fetch('https://www.youtube.com/watch?v=R6IT_f0XPT8', all_images=True) print(sample) pprint(sample) print("*" * 100) from lassie import Lassie l = Lassie() sample = l.fetch('https://www.youtube.com/watch?v=R6IT_f0XPT8') print(sample) pprint(sample) print("*" * 100) l.request_opts = { 'headers': { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) ' 'Version/12.1.1 Safari/605.1.15 '
def __init__(self, URL):
    """
    This method initializes the URL in the class and crawls the page.

    News-Please (plus Lassie for the meta keywords) is tried first; if
    anything in that step raises, or no content is retrieved, the crawl is
    retried with Goose. Failures are reported through ``highlight_back``
    and ``line_loc`` rather than raised.

    :type URL: string
    :param URL: The URL of the Website
    """
    self.URL = URL
    try:
        # initialize NewsPlease
        news_please_article = NewsPlease.from_url(self.URL)
        # set title
        self.title = news_please_article.title
        # set content
        self.content = news_please_article.text
        # if the content retrieved is empty or missing, raise so the crawler
        # migrates from News-Please and Lassie to Goose; the message makes
        # the reason visible in the log instead of an empty string
        if not self.content:
            raise ValueError("News-Please returned no content")
        # set meta keywords (a page without keywords raises here and also
        # triggers the Goose fallback)
        self.meta_keywords = lassie.fetch(self.URL)["keywords"]
        # set meta description
        self.meta_description = news_please_article.description
        # top image url
        self.top_img_url = news_please_article.image_url
    except Exception as exception:
        highlight_back(
            "[Crawler] Crawler migrated from News-Please and Lassie to Goose due to an exception: {}"
            .format(exception), 'G')
        line_loc()
        try:
            # initialize Goose
            goose = Goose()
            # initialize the goose article object
            goose_article = goose.extract(self.URL)
            # assign title
            self.title = goose_article.title
            # assign content
            self.content = goose_article.cleaned_text
            # if the content retrieved is empty or missing, report it as a
            # Goose failure with an explicit reason
            if not self.content:
                raise ValueError("Goose returned no content")
            # assign meta keywords (str) and split it to form a list
            self.meta_keywords = goose_article.meta_keywords.split(',')
            # assign meta description
            self.meta_description = goose_article.meta_description
            # top image url
            self.top_img_url = ''
        except Exception as exception:
            highlight_back(
                "[Crawler] An exception has occured in Goose: {}".format(
                    exception), 'R')
            line_loc()
def __init__(self, ssb_config):
    """Build the site-specific browser window described by ``ssb_config``.

    Restores the saved geometry, installs the keyboard shortcuts, sets the
    window icon (from settings, the configured favicon URL, or the favicon
    Lassie finds on the page), restores saved cookies and loads the page.

    :param ssb_config: mapping read for ``name``, ``title``, ``url`` and,
        optionally, ``favicon_url``
    """
    super(SSBWindow, self).__init__()

    # Load Settings
    self.config = ssb_config
    self.settings = QtCore.QSettings("pyssb", self.config['name'])

    # Window Position
    self.setWindowTitle(self.config['title'])
    try:
        self.restoreGeometry(self.settings.value("geometry"))
    except Exception:
        # Reasonable Defaults
        self.resize(900, 600)
        self.move(100, 100)

    ### Extra Browser Functionality
    self.search = SearchWidget(self)
    self.showSearch = QtWidgets.QShortcut(
        "Ctrl+F", self, activated=lambda: self.search.toggleSearch())
    self.quit = QtWidgets.QShortcut("Ctrl+Q", self, activated=self.close)
    self.zoomIn = QtWidgets.QShortcut(
        "Ctrl++",
        self,
        activated=lambda: self.setZoomFactor(self.zoomFactor() + .2))
    self.zoomIn2 = QtWidgets.QShortcut(
        "Ctrl+=",
        self,
        activated=lambda: self.setZoomFactor(self.zoomFactor() + .2))
    # Stored under its own name; it used to overwrite ``self.zoomIn``.
    self.zoomOut = QtWidgets.QShortcut(
        "Ctrl+-",
        self,
        activated=lambda: self.setZoomFactor(self.zoomFactor() - .2))
    self.zoomOne = QtWidgets.QShortcut(
        "Ctrl+0", self, activated=lambda: self.setZoomFactor(1))
    ###

    # Icon
    icon_data = self.settings.value("icon")
    if not icon_data:
        # Nothing stored yet: download the favicon and remember it.
        try:
            try:
                favicon_url = self.config['favicon_url']
            except Exception:
                # No favicon configured: ask Lassie for the page's favicon.
                page = lassie.fetch(self.config['url'])
                favicon_url = next(
                    (img['src'] for img in page['images']
                     if img['type'] == 'favicon'),
                    None)
            if favicon_url is None:
                raise LookupError("no favicon found")
            response = urlopen(favicon_url)
            try:
                icon_data = response.read()
            finally:
                response.close()
            self.settings.setValue('icon', icon_data)
        except Exception:
            # Best effort: any failure simply leaves the window without an icon.
            icon_data = None

    if icon_data:
        icon_img = QtGui.QImage.fromData(icon_data)
        icon_pix = QtGui.QPixmap.fromImage(icon_img)
        self.setWindowIcon(QtGui.QIcon(icon_pix))

    # Cookie Jar
    self.cookiejar = QtNetwork.QNetworkCookieJar(self)
    self.page().networkAccessManager().setCookieJar(self.cookiejar)

    # Load Cookies
    # NOTE(review): pickle.loads is only safe while the settings store is
    # trusted; consider a non-executable serialization format.
    try:
        raw_cookies = pickle.loads(self.settings.value("cookies"))
        cookies = [
            QtNetwork.QNetworkCookie.parseCookies(raw)[0]
            for raw in raw_cookies
        ]
        self.cookiejar.setAllCookies(cookies)
    except Exception:
        # Missing or unreadable cookie data: start with an empty jar.
        pass

    # Load Page
    self.load(QtCore.QUrl(self.config['url']))
def test_open_graph_no_og_title_no_og_url(self):
    """Check the url and title returned for the no-og:title / no-og:url fixture."""
    page_url = "http://lassie.it/open_graph/no_og_title_no_og_url.html"
    expected_title = "Lassie Open Graph Test | No og:title, No og:url"

    result = lassie.fetch(page_url)

    # The returned url must equal the URL that was requested.
    self.assertEqual(result["url"], page_url)
    self.assertEqual(result["title"], expected_title)