def collect_linter_previews():
    """Fetch and cache web-preview metadata (title/description/image) for all linters."""
    linters = megalinter.utils.list_all_linters()
    # Load the current preview cache from disk.
    with open(LINKS_PREVIEW_FILE, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)
    updated = False
    # Fetch preview info for any linter missing from the cache (or for all
    # of them when a refresh is explicitly requested via configuration).
    for linter in linters:
        needs_preview = (
            linter.linter_name not in data
            or megalinter.config.get("REFRESH_LINTER_PREVIEWS", "false") == "true"
        )
        if not needs_preview:
            continue
        logging.info(
            f"Collecting link preview info for {linter.linter_name} at {linter.linter_url}"
        )
        title, description, image = web_preview(
            linter.linter_url, parser="html.parser", timeout=1000
        )
        data[linter.linter_name] = {
            "title": megalinter.utils.decode_utf8(title),
            "description": megalinter.utils.decode_utf8(description),
            "image": image,
        }
        updated = True
    # Persist the cache only when something actually changed.
    if updated:
        with open(LINKS_PREVIEW_FILE, "w", encoding="utf-8") as outfile:
            json.dump(data, outfile, indent=4, sort_keys=True)
async def codex(self, context):
    """Reply with an embed previewing the Warframe wiki page for the requested entry.

    The entry name is taken from the message text following the
    ``!senpai codex`` command prefix.
    """
    offset = len("!senpai codex")
    mod_name = context.message.content[offset + 1:].strip()
    # Bail out early when the user did not name an entry.
    # (Fixed grammar in the prompt: it previously read "are looking for".)
    if not mod_name:
        await context.send("`Operator, what codex entry are you looking for?`")
        return
    # Wiki page names are Capitalized words joined by underscores.
    mod_name = "_".join(word.capitalize() for word in mod_name.split())
    mod_url = _WARFRAME_WIKIA_URL.format(mod_name)
    try:
        title, description, image_url = webpreview.web_preview(mod_url)
        embed_msg = discord.Embed(title=title, url=mod_url, color=COLOR)
        embed_msg.add_field(name="Description", value=description, inline=True)
        embed_msg.set_image(url=image_url)
        await context.send(embed=embed_msg)
    except Exception as e:
        # Best effort: the wiki page may not exist or the preview may fail.
        print(repr(e))
        await context.send(
            "`Operator, my codex does not seem to have an entry for this`")
def get_url_info(url):
    """Capture opengraph data from links.

    It tries to get everything from Facebook.
    TO DO: Have a default image when no image is found
    """
    token = settings.FACEBOOK_TOKEN
    fb_graph = GraphAPI(access_token=token, version='2.10')
    fb_info = fb_graph.get_object(
        id=quote(url),
        fields=['engagement,og_object{image,description,title,updated_time}'])
    if not fb_info:
        # Facebook returned nothing at all for this URL.
        return dict(thumbnail='',
                    facebook_shares=0,
                    title='',
                    description='',
                    source=url.split('/')[2])
    try:
        og = fb_info['og_object']
        return dict(thumbnail=og['image'][0]['url'],
                    facebook_shares=fb_info['engagement']['share_count'],
                    title=og['title'],
                    description=og['description'],
                    source=url.split('/')[2])
    except KeyError:
        # Facebook had incomplete opengraph data; scrape the page directly.
        from webpreview import web_preview
        metadata = web_preview(url)
        return dict(thumbnail=metadata[2],
                    facebook_shares=fb_info['engagement']['share_count'],
                    title=metadata[0],
                    description=metadata[1],
                    source=url.split('/')[2])
def post(self, request, *args, **kwargs):
    """Build a link-preview HTML snippet for the submitted URL and return it as JSON."""
    content = request.POST['content']
    # Try an oEmbed provider first; fall back to an empty result when the
    # URL is not handled by any known provider.
    try:
        oembed_preview = providers.request(content)
    except ProviderNotFoundException:
        oembed_preview = {}
    # Always generate web_preview as fallback for oembed_preview
    try:
        title, description, image = web_preview(content, parser='html.parser')
    except (requests.exceptions.InvalidURL, webpreview.excepts.EmptyURL):
        # Nothing usable can be previewed for this input.
        return JsonResponse({'preview': None, 'title': None})
    # Prefer the provider's HTML, then a plain image tag, then a stub.
    if 'html' in oembed_preview:
        preview_html = oembed_preview['html']
    elif image:
        preview_html = f'<img src="{image}" alt="Website Preview" />'
    else:
        preview_html = '<div class="unavailable">Preview not available for this URL</div>'
    # The provider title, when present, wins over the scraped one.
    if 'title' in oembed_preview:
        title = oembed_preview['title']
    return JsonResponse({'preview': preview_html, 'title': title})
async def yugioh(self, context):
    """Look up a Yu-Gi-Oh! card on the wikia and post a preview embed for it."""
    offset = len("!senpai yugioh")
    card_name = context.message.content[offset + 1:].strip()
    # Nothing after the command means the user forgot the card name.
    if not card_name:
        await context.send("`Usage: !senpai yugioh [card name]`")
        return
    # Wikia page names use underscores between words.
    card_name = "_".join(card_name.split())
    formatted_url = _YUGIOH_WIKIA_URL.format(card_name)
    print("url: ", formatted_url)
    try:
        title, description, image_url = webpreview.web_preview(
            formatted_url)
        embed_msg = discord.Embed(title=title,
                                  url=formatted_url,
                                  color=COLOR)
        embed_msg.add_field(name="Description", value=description, inline=True)
        embed_msg.set_image(url=image_url)
        await context.send(embed=embed_msg)
    except Exception as e:
        # Page missing or preview failed: report a friendly message.
        print(repr(e))
        await context.send(
            "`KaibaCorp does not have any information on this card`")
def _get_site_title_and_description(url, content):
    """Extract the page title and a truncated description from *content*.

    Args:
        url: Source URL, passed to ``web_preview`` and used in log messages.
        content: Raw HTML of the page, parsed by ``web_preview``.

    Returns:
        A ``(title, description)`` tuple; ``('', '')`` when extraction fails.
    """
    try:
        logger.info('Getting metadata for {}'.format(url))
        title, description, _ = web_preview(url, content=content)
        # Keep descriptions to a readable length for display.
        description = description[:450] + '...' if description and len(description) > 450 else description
        return title, description
    except Exception:
        # Previously a bare ``except:`` — that would also swallow
        # SystemExit/KeyboardInterrupt; catch Exception so those propagate.
        logger.exception('Could not get metadata for {}'.format(url))
        return '', ''
def share(request):
    """Handle the share page: render the form on GET, validate and save a URL on POST.

    Leftover debug ``print`` statements were removed; behavior is otherwise
    unchanged.
    """
    tags = [t[0] for t in Content.TAGS]
    if request.method == 'GET':
        return render(request, 'web/share.jinja2', {"tags": tags})
    if request.method == 'POST':
        # django-ratelimit marks throttled requests via ``request.limited``.
        was_limited = getattr(request, 'limited', False)
        if was_limited:
            return render(
                request, 'web/share.jinja2', {
                    "error":
                    "Please don't spam, you can share 2 URL every 5 minute. Thanks!",
                    "tags": tags
                })
        url = request.POST.get('url')
        tag = request.POST.get('tag')
        if not is_valid_url(url):
            return render(request, 'web/share.jinja2', {
                "error": "Not a valid URL!",
                "tags": tags
            })
        if tag not in tags:
            return render(request, 'web/share.jinja2', {
                "error": "Enter valid tag!",
                "tags": tags
            })
        if not is_domain_in_list(url):
            return render(request, 'web/share.jinja2', {
                "error": "This is not a supported URL!",
                "tags": tags
            })
        # Fetch opengraph metadata; require at least a title and an image
        # before accepting the submission.
        title, desc, img_url = web_preview(url)
        if title and img_url:
            cntnt = Content(link=url,
                            title=title,
                            description=desc,
                            image=img_url,
                            tag=tag)
            cntnt.save()
            return render(request, 'web/share.jinja2', {
                "success": "URL added successfully",
                "tags": tags
            })
        return render(request, 'web/share.jinja2', {
            "error": "URL cannot be parsed, please try another URL.",
            "tags": tags
        })
async def Ladd(self, ctx, link, *tags):
    """Add a link (with optional tags) to the database and report the result.

    params:
        link -> the URL to store
        tags -> any number of tags to attach to the link
    """
    msg = ""
    authID = ctx.author.id
    chanName = ctx.channel.name
    lienAjoute = False
    # Normalize tags to lower case. (The previous loop only re-bound the
    # loop variable, so the lowercasing was silently lost.)
    tags = tuple(t.lower() for t in tags)
    if val.url(link):
        # The link is well-formed: try to fetch preview metadata.
        title, description = "", ""
        if ".pdf" not in link:
            try:
                ret = web_preview(link, timeout=1)
                title, description = ret[0], ret[1]
                if description:
                    # Avoid double quotes so the text embeds safely.
                    description = description.replace("\"", "'")
            except Exception:
                # Best effort only: a failed preview still allows the insert.
                pass
        else:
            # PDF links: use the file name as the title.
            title, description = link.split("/")[-1], "Fichier PDF a télécharger."
        lienAjoute = mdb.addLien(link, chanName, "??", authID, title, description)
        if tags:
            msg = "Lien ajouté avec les tags :"
            for tag in tags:
                # Replace the tag by its canonical synonym when one exists.
                tag_tmp = mdb.searchSynonymeByPrimKey(tag)
                if tag_tmp:
                    tag = tag_tmp[0][2]
                mdb.addTag(tag, "", authID)
                mdb.addTagmap(link, tag)
                msg += " " + tag
        else:
            msg = "Lien ajouté sans tag"
        if not lienAjoute:
            msg = "Le lien existe déjà dans la base de donnée ou une erreur a eu lieu."
    else:
        # BUG FIX: this message was a bare string expression (a no-op), so an
        # invalid link previously made the bot send an empty message.
        msg = "Le lien n'est pas conforme"
    await ctx.channel.send(msg)
def find_artist(update: Update, context: CallbackContext) -> None:
    """Extract the artist from a shared Spotify track link and reply with
    their Wikipedia summary.

    Any failure (bad link, preview error, Wikipedia miss) results in a
    usage hint being sent back to the user.
    """
    data = update.message.text
    logger.info(f"data from user {data}")
    try:
        web_prev = web_preview(data)
        full = web_prev[1]
        # Spotify descriptions read "<track>, a song by <artist> on Spotify".
        artist = full[full.find("a song by") + len("a song by "):full.find(" on Spotify")]
        wiki_info = wikipedia.summary(artist)
        update.message.reply_text(wiki_info)
    except Exception:
        # Previously a bare ``except:``; also fixed the "incorect" typo in
        # the user-facing message.
        update.message.reply_text(
            "Your link is incorrect! Please provide a track link (you can get it from share button in spotify)"
        )
def parse_using_web(self):
    """Scrape ``self.url`` with web_preview and return its metadata as a dict.

    On failure, the returned dict carries the raised exception object under
    the ``'error'`` key instead of the metadata fields.
    """
    try:
        url_info = {}
        url_info["parser"] = "html_parser"
        meta = web_preview(self.url, parser="html.parser")
        url_info['title'], url_info['description'], url_info['image'] = meta
        return url_info
    except Exception as e:
        url_info['error'] = e
        return url_info
def save_images_from_rss():
    """Download share images from the configured RSS feeds into ``<root>/img/``.

    Old images are deleted first; downloads that fail are skipped, and files
    smaller than ``MIN_IMAGE_SIZE`` KB are removed afterwards.
    """
    # Needed to prevent bozo_exception
    if hasattr(ssl, '_create_unverified_context'):
        ssl._create_default_https_context = ssl._create_unverified_context
    img_dir = app.root_path + '/img/'
    # Delete all old image files
    for file in os.listdir(img_dir):
        os.remove(img_dir + file)
    # Image naming doesn't really matter, so we'll name it a number
    image_number = 0
    # Accepted extensions — hoisted out of the loop since it is constant.
    VALID_IMAGE_EXTENSIONS = [".jpg", ".jpeg", ".png", ".gif"]
    # Loop through each rss feed
    for pub, feed_url in rss_feeds.items():
        feed = feedparser.parse(feed_url)
        for post in range(POSTS_FROM_EACH_FEED):
            try:
                # Get share image and save to server
                title, description, image = web_preview(
                    feed.entries[post]['link'])
                path = urlparse(image).path
                ext = os.path.splitext(path)[1]
                if ext in VALID_IMAGE_EXTENSIONS:
                    urllib.request.urlretrieve(
                        image, img_dir + str(image_number) + ext)
                elif not ext:
                    # If no extension save as jpg
                    urllib.request.urlretrieve(
                        image, img_dir + str(image_number) + '.jpg')
                image_number += 1
            except Exception:
                # Previously a bare ``except:``; the feed may have fewer
                # entries than expected or expose no image at all.
                print('Could not get image')
    # Delete files that are not of a certain size
    for file in os.listdir(img_dir):
        if os.path.getsize(img_dir + file) < MIN_IMAGE_SIZE * 1024:
            os.remove(img_dir + file)
    return
def get_url_metadata(preview_link):
    """Collect preview metadata (title, description, proxied image URLs and
    the domain) for the given link."""
    title, description, image_url = web_preview(preview_link,
                                                parser='html.parser')
    favicon_url = get_favicon_url_from_url(preview_link)
    domain_url = get_url_domain(preview_link)
    # Route remote images through the local image proxy when they exist.
    image_url = make_proxy_image_url(image_url) if image_url is not None else None
    favicon_url = make_proxy_image_url(favicon_url) if favicon_url is not None else None
    return dict(title=title,
                description=description,
                image_url=image_url,
                favicon_url=favicon_url,
                domain_url=domain_url)
def geturl(stringsubmitted):
    # Scan a free-text message for the first plausible (non-image) URL and
    # return it together with its web-preview metadata.
    # Returns (value, urltitle, urldescription, urlimage); all four are ''
    # when no usable URL is found.
    headers = {'User-Agent': 'Mozilla/5.0'}
    splits = stringsubmitted.split()
    value = ''
    urltitle = ''
    urldescription = ''
    urlimage = ''
    for f in splits:
        try:
            # Ignore very short tokens — they cannot be useful URLs.
            if len(f) > 5:
                # Skip direct image links (note: '.png' appears twice).
                if f.endswith(('.jpg', '.png', '.gif', '.png', '.jpeg',
                               '.JPG', '.webp')):
                    pass
                else:
                    # Default to https when no scheme is given.
                    if not f.lower().startswith(("http://", "https://")):
                        f = 'https://' + f
                    value = validators.url(f)
                    # NOTE(review): validators.url normally returns True or a
                    # ValidationFailure, not the URL string, so passing
                    # ``value`` to web_preview below looks suspicious —
                    # confirm which ``validators`` module is imported here.
                    mainurl = (checkers.is_url(value))
                    if mainurl is True:
                        urltitle, urldescription, urlimage = web_preview(
                            value, headers=headers)
                        # Stop at the first URL that previews successfully.
                        break
                    else:
                        value = ''
                        urltitle = ''
                        urldescription = ''
                        urlimage = ''
        except:
            # Best effort: any parse/network failure moves to the next token.
            pass
    # Normalise None results from web_preview to empty strings.
    if value is None:
        value = ''
    if urltitle is None:
        urltitle = ''
    if urldescription is None:
        urldescription = ''
    if urlimage is None:
        urlimage = ''
    return value, urltitle, urldescription, urlimage
def get_articles(request):
    """Return the newest press articles as JSON, lazily filling in any
    missing metadata via web_preview and persisting it back to the model."""
    limit = int(request.GET['limit'])
    articles = Press.objects.order_by('-id')[:limit]
    body = []
    for a in articles:
        title, description, image, site = a.title, a.description, a.image, a.site
        # Any missing field triggers a fresh scrape of the article URL.
        if not (title and description and image and site):
            title, description, image = web_preview(a.url)
            site = "/".join(a.url.split("/")[:3])
        try:
            # Relative image paths get the site prepended.
            if "http" not in image:
                image = "{}/{}".format(site, image)
        except:
            # image may be None here; keep it as-is.
            pass
        a.title = title
        a.description = description
        a.site = site
        a.image = image
        a.save()
        print(a.url)
        body.append({
            'image': image,
            'title': title,
            'description': description,
            'site': site.split("/")[-1],
            'url': a.url,
        })
    data = {
        'headers': {
            'Content-Type': 'application/json',
            'Access-Control-Allow-Origin': '*',
        },
        'body': body
    }
    return HttpResponse(simplejson.dumps(data))
def _get_article_image(self, article_q, final_q):
    """
    Gets article link and other info, then gets URL for that article image

    Consumes article dicts from ``article_q``; for each one, resolves an
    image URL (Twitter card first, then opengraph, then a per-source default
    logo) and puts the enriched dict on ``final_q``. ``None`` is the
    shutdown sentinel and is forwarded before returning.
    """
    try:
        work = True
        while work:
            data = article_q.get(block = True)
            if(data is None):
                # Sentinel: forward it so downstream consumers also stop.
                work = False
                final_q.put(None)
                return
            else:
                source = data["source"]
                article_title = data["title"]
                article_summary = data["summary"]
                article_link = data["link"]
                t = data["time"]
                # Prefer the page's twitter:image card.
                tc = TwitterCard(article_link, ["twitter:image"])
                if(tc.image is None):
                    # Fall back to opengraph metadata.
                    _, _, image = web_preview(article_link, parser = "html.parser")
                    if(image is None):
                        # No image anywhere: use a per-source default logo.
                        # NOTE(review): a source outside these three leaves
                        # ``img`` unbound (NameError -> RuntimeError below)
                        # — confirm the source set is closed.
                        if(source == "standard_agrix"):
                            img = "https://www.farmers.co.ke/assets/images/logo.png"
                        elif(source == "standard_biz"):
                            img = "https://www.standardmedia.co.ke/common/i/standard-digital-world-inner-page.png"
                        elif(source == "business_daily"):
                            img = "https://www.businessdailyafrica.com/image/view/-/3818190/medRes/1349497/-/3ijc6bz/-/logoNew.png"
                    else:
                        img = image
                else:
                    img = tc.image
                fin_dict = {"title": article_title, "summary": article_summary,
                            "link": article_link, "source": source,
                            "image": img, "time": t}
                final_q.put(fin_dict)
    except Exception as e:
        # NOTE(review): re-raising as RuntimeError(e) loses the original
        # traceback; ``raise RuntimeError(e) from e`` would preserve it.
        raise RuntimeError(e)
def handle(self, *args, **options):
    # Management-command entry point: read (tag, feed-url) pairs from
    # rss_url_list.txt located next to this file, fetch each feed, and save
    # every entry as a tagged Article with a preview image.
    with open(
            os.path.dirname(os.path.realpath(__file__)) + os.sep +
            "rss_url_list.txt", 'r') as r_f:
        for data in r_f:
            # Skip blank lines.
            if not data or data == '\n':
                continue
            # Each line is "<tag>\t<feed url>".
            url_data = re.split(r'\t+', data.rstrip())
            raw_data = reader.read(url_data[1])
            for line in raw_data:
                # line appears to be (title, summary, link) — TODO confirm
                # against the reader implementation.
                link = unquote(line[2])
                # Unwrap Google redirect URLs to the real target.
                # NOTE(review): the pattern is not a raw string and '?'/'.'
                # are regex metacharacters here, so it matches more loosely
                # than the literal URL suggests — confirm this is intended.
                m = re.search(
                    'https://www.google.com/url?.*url=('
                    'https://.*)&ct=ga&cd=.*', link)
                if m:
                    link = m.group(1)
                try:
                    title, description, image = web_preview(link)
                    if image is None:
                        # Placeholder when the page has no preview image.
                        image = "https://www.freeiconspng.com/uploads/no-image-icon-6.png"
                    try:
                        tag = Tag.objects.get(tag_text=url_data[0])
                    except Tag.DoesNotExist:
                        # First time we see this tag: create it.
                        tag = Tag(tag_text=url_data[0])
                        tag.save()
                    article = Article(pub_date=timezone.now(),
                                      title_text=line[0],
                                      summary_text=line[1],
                                      link_text=link,
                                      image_url=image,
                                      state=0)
                    article.save()
                    article.tags.add(tag)
                    self.stdout.write(
                        self.style.SUCCESS(
                            'Successfully saved article. title: "%s"'
                            % article.title_text))
                except Exception as e:
                    # Keep going on a per-article failure; just log it.
                    print(e)
# coding: utf-8
# Fetch opengraph metadata for a page and write a minimal index.html that
# republishes it as twitter/og meta tags.

from webpreview import web_preview
import bs4 as BeautifulSoup  # NOTE(review): unused alias; kept so existing imports are preserved

url = "http://www.streetpress.com/sujet/1488190869-tribunal-de-l-armee"
title, description, image = web_preview(url)

template = """<html><head><meta name="twitter:card" content="" />
<meta name="twitter:site" content="" />
<meta name="twitter:title" content="%s" />
<meta name="twitter:description" content="%s" />
<meta name="twitter:image" content="%s" />
<meta property="og:type" content="website">
<meta property="og:title" content="%s">
<meta property="og:description" content="%s">
<meta property="og:url" content="%s">
<meta property="og:image" content="%s">
</head><body></body></html>""" % (title, description, image,
                                  title, description, url, image)

# Use a context manager so the file is closed even if the write fails
# (previously open/write/close with no error handling).
with open('index.html', 'w') as f:
    f.write(template)
def web_preview_link(link):
    """Return the (title, description, image) preview tuple for *link*."""
    return web_preview(link)
from webpreview import web_preview
from pprint import pprint

# Demo: fetch and pretty-print the preview metadata of a sample domain.
url = 'auctorial.com'
meta = web_preview(url)
pprint(meta)
def resolve_new_activities(self, info, **kwargs):
    """Fetch a fresh activity suggestion for the given user (by email) and
    return Activity rows built from Google search results about it.

    Returns None when the email is missing or unknown. Removed dead code:
    an unused Google Custom Search helper that embedded a hard-coded API
    key and CSE id — credentials must never live in source.
    """
    import json
    email = kwargs.get('email')
    if email and BoredUser.objects.filter(email=email).exists():
        import requests
        import random
        from webpreview import web_preview
        prob = random.random()
        user = BoredUser.objects.get(email=email)
        # Category weights are stored as a comma-separated string.
        weights = user.category_weights.split(',')
        weights = [int(x) for x in weights]
        fav_category = weights.index(max(weights))
        CATEGORIES = [
            "education", "recreational", "social", "diy", "charity",
            "cooking", "relaxation", "music", "busywork"
        ]
        from google import google
        # ~40% of requests are biased towards the user's favourite
        # category; the rest take a fully random activity.
        if prob > 0.6:
            resp = requests.get("https://www.boredapi.com/api/activity/")
        else:
            resp = requests.get(
                "https://www.boredapi.com/api/activity?type=" +
                CATEGORIES[fav_category])
        resp = json.loads(resp.text)
        num_page = 1
        result = google.search(resp['activity'], num_page)
        activities = []
        for search_results in result[:6]:
            activity = Activity()
            activity.name = resp['activity']
            activity.category = resp['type']
            activity.participants = int(resp['participants'])
            activity.key = resp['key']
            activity.title = search_results.name
            activity.link = search_results.link
            activity.description = search_results.description
            # Best-effort thumbnail: third element of the preview tuple.
            img_url = web_preview(activity.link)[2]
            activity.thumb = img_url if img_url else ''
            activity.save()
            activities.append(activity)
        # NOTE(review): if the search returned no results, ``activity`` is
        # unbound here — confirm result is always non-empty.
        user.last_activity = activity
        user.save()
        return activities
if image is None: image = "https://cdn.crabber.net/img/avatar.jpg" return title, description, image with Lock("fetch-cards") as lock: if lock: app.app_context().push() for card in Card.query_unready(): try: metadata = web_preview( # Redirect Twitter to Nitter (they've started requiring # javascript... so dumb.) card.url.replace( "https://twitter.com", "https://nitter.actionsack.com" ), timeout=2, ) if metadata: card.title, card.description, card.image = metadata if card.title is not None: if all( [ not card.title.startswith(error_code + " ") for error_code in ("404", "403", "500") ] ): card.ready = True print(f"Fetched {card.url}") except (URLUnreachable, URLNotFound, RequestException, KeyError):