def trigger_w(self, msg):
    "Usage: w <search term>. Prints a short description of the corresponding Wikipedia article."
    if len(msg.args) == 0:
        self.bot.notice(msg.nick, "Please specify a search term")
        return
    params = {
        'action': 'opensearch',
        'format': 'xml',
        'limit': '2',
        'search': ' '.join(msg.args)
    }
    url = 'http://{:s}.wikipedia.org/w/api.php'.format(self.language)
    response = BeautifulStoneSoup(requests.post(url, data=params).text)
    # BS4 tag searches are case sensitive, hence the case-insensitive regexes.
    if response.find(re.compile('text', re.I)):
        index = 0
        if "may refer to:" in response.find(re.compile('description', re.I)).string:
            index = 1
        info = response.find_all(re.compile('description', re.I))[index].string.strip()
        url = response.find_all(re.compile('url', re.I))[index].string
        short_url = self.shorten(url)
        message = u"\002Wikipedia ::\002 {} \002::\002 {}".format(info, short_url)
        self.bot.privmsg(msg.channel, message)
    else:
        self.bot.privmsg(msg.channel, "{}: no articles were found.".format(' '.join(msg.args)))
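# A minimal standalone sketch of the same Wikipedia OpenSearch call, for
# reference only. It assumes the requests, bs4 and lxml packages and a
# hard-coded "en" language code; the XML response capitalises its tag names
# ("Item", "Description", "Url"), which is why the handler above resorts to
# case-insensitive regexes when searching the soup.
import requests
from bs4 import BeautifulSoup


def opensearch(term, language="en"):
    params = {
        'action': 'opensearch',
        'format': 'xml',
        'limit': '2',
        'search': term,
    }
    url = 'https://{}.wikipedia.org/w/api.php'.format(language)
    soup = BeautifulSoup(requests.get(url, params=params).text, 'xml')
    results = []
    # Each <Item> in the response wraps a <Text>, <Description> and <Url> element.
    for item in soup.find_all('Item'):
        description = item.find('Description')
        link = item.find('Url')
        if description and link:
            results.append((description.get_text(strip=True), link.get_text(strip=True)))
    return results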
def run(self):
    """Poll the RSS feed once an hour and store items newer than the newest saved id."""
    while True:
        try:
            maxid = self.db.news_list()[0]['id']
        except Exception:
            maxid = 1
        print(maxid)
        client = HTTPClient()
        response = client.fetch('http://cs.hust.edu.cn/rss')
        result = response.body.decode('utf-8', errors='ignore')
        soup = BeautifulStoneSoup(result)
        items = soup.find_all('item')
        for item in items:
            title = item.title.text
            link = item.link.text
            desc = item.description.text
            linkid = self.link_id(link)
            if linkid > maxid:
                result = self.db.add_news(linkid, title, desc, link)
                if result:
                    result = self.get_article(link)
            else:
                # Stop at the first item that is not newer than the newest stored id.
                break
        time.sleep(3600)
def _parse_html(self, url):
    '''
    Loads the page at `url` and processes every link found on it.
    Recursively calls itself for the first link that has not been
    processed yet, so the whole site ends up being parsed.
    '''
    html = None
    page_content = None
    self._processed.add(url)
    self._recursion_counter += 1
    try:
        with urllib.request.urlopen(url) as response:
            html = response.read()
    except Exception:
        html = None
        print('Unable to load url %s' % url)
    if html:
        try:
            page_content = Soup(html)
        except Exception:
            page_content = None
    if page_content:
        stop_list = ('#', '', '/')
        for a in page_content.find_all('a', href=True):
            if a['href'] not in stop_list:
                href = self._build_link(url=a['href'], location_parts=urlparse(url))
                if href:
                    try:
                        self._url_validator(href)
                    except ValidationError:
                        print('%s is not valid url' % href)
                    else:
                        self._finds.add(href)
        self._add_location(url)
    unprocessed = self._finds - self._processed
    print('Total pages: %s. Pages processed: %s. Recipes found: %s. Last URL: %s'
          % (len(self._finds), len(self._processed), len(self._urls), url))
    # Save self._urls on every 20th call of this method.
    if self._recursion_counter % 20 == 0:
        self._save()
        self._recursion_counter = 0
    if unprocessed:
        if self.sleep_time > 0:
            time.sleep(self.sleep_time)
        next_url = list(unprocessed)[0]
        self._parse_html(next_url)
def get_all_urls(self):
    """Return the list of URLs listed in the sitemaps."""
    list_of_urls = []
    for url in self.urls:
        response = self.session.get(url, headers=self.headers)
        soup = BeautifulStoneSoup(response.content)
        locs = soup.find_all('loc')
        list_of_urls += [loc.next_element for loc in locs]
    return list_of_urls
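# A standalone sketch of the same <loc> extraction for a single sitemap, in
# case the surrounding class is not available. The sitemap URL is a made-up
# placeholder, and the 'xml' parser assumes lxml is installed.
import requests
from bs4 import BeautifulSoup


def sitemap_locs(sitemap_url='https://example.com/sitemap.xml'):
    response = requests.get(sitemap_url)
    soup = BeautifulSoup(response.content, 'xml')
    # Each <loc> element in a sitemap wraps exactly one URL.
    return [loc.get_text(strip=True) for loc in soup.find_all('loc')]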
def render(self, context):
    fancount = ''
    fb_api_url = 'http://api.facebook.com/restserver.php'
    tw_api_url = 'http://api.twitter.com'
    cache_key = ''
    cache_time = 1800
    if self.service == "facebook":
        query = '%s?method=facebook.fql.query&query=SELECT%%20fan_count%%20FROM%%20page%%20WHERE%%20page_id=%s'
        xml_path = query % (fb_api_url, self.service_id)
        cache_key = md5(xml_path.encode()).hexdigest()
        fancount = cache.get(cache_key)
        if not fancount:
            try:
                xml = urlopen(xml_path)
                content = xml.read()
                soup = BeautifulStoneSoup(content)
                nodes = soup.find_all('page')
                for node in nodes:
                    fancount = node.fan_count.string
                cache.set(cache_key, fancount, cache_time)
            except Exception:
                pass
    if self.service == "twitter":
        query = "%s/1/users/show/%s.xml"
        xml_path = query % (tw_api_url, self.service_id)
        cache_key = md5(xml_path.encode()).hexdigest()
        fancount = cache.get(cache_key)
        if not fancount:
            try:
                xml = urlopen(xml_path)
                content = xml.read()
                soup = BeautifulStoneSoup(content)
                nodes = soup.find_all('user')
                for node in nodes:
                    fancount = node.followers_count.string
                cache.set(cache_key, fancount, cache_time)
            except Exception:
                pass
    return fancount
def parse(word: str, soup: BeautifulStoneSoup) -> dict:
    entries = []
    word = {'word': word, 'entries': entries}
    for entry in soup.find_all(class_='ldoceEntry Entry'):
        entries.append({})
        last_entry = entries[-1]
        with suppress(AttributeError):
            american_pron = entry.find(class_='AMEVARPRON')
            american = f'/{american_pron.text.strip()}/' if american_pron else ''
            last_entry['pron'] = '/{english}/ {american}'.format(
                english=entry.find(class_='PRON').text.strip(),
                american=american,
            ).rstrip()
        try:
            last_entry['pos'] = entry.find(class_='POS').text.strip()
        except AttributeError:
            entries.pop()
            continue
        senses = last_entry['senses'] = []
        for sense in entry.find_all(class_='Sense'):
            senses.append({})
            last_sense = senses[-1]
            try:
                last_sense['definition'] = sense.find(class_='DEF').text.strip()
            except AttributeError:
                try:
                    last_sense['definition'] = sense.find(class_='REFHWD').text.strip()
                except AttributeError:
                    senses.pop()
                    continue
            find_rel = sense.find(class_='RELATEDWD')
            if find_rel:
                last_sense['rel'] = find_rel.text.strip()[2:]
            find_syn = sense.find(class_='SYN')
            if find_syn:
                last_sense['syn'] = find_syn.text.strip()[4:]
            find_opp = sense.find(class_='OPP')
            if find_opp:
                last_sense['opp'] = find_opp.text.strip()[4:]
            last_sense['examples'] = [
                e.text.strip() for e in sense.find_all(class_='EXAMPLE')
            ]
    return word
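# A hedged usage sketch for parse() above: it assumes the ldoceonline.com
# dictionary URL layout and that requests/bs4 are installed. parse() only
# needs a soup of the entry page, however it was fetched.
import requests
from bs4 import BeautifulSoup


def lookup(word):
    url = 'https://www.ldoceonline.com/dictionary/{}'.format(word)
    html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text
    return parse(word, BeautifulSoup(html, 'html.parser'))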
def run(self, file_name, user, **kwargs):
    """
    Parse the given xml file using BeautifulSoup.
    Save all Article, Redirect and Page objects.
    """
    with open(file_name, 'r') as f:
        xml = f.read()
    soup = BeautifulStoneSoup(xml)
    items = soup.find_all('item')
    for item in items:
        post_type = item.find('wp:post_type').string
        post_status = item.find('wp:status').string
        if post_type == 'attachment':
            get_media(item, user)
            # Note! This script assumes all the attachments come before
            # posts and pages in the xml. If this ends up changing,
            # do two loops, one with attachments and the second with posts and pages.
        elif post_type == 'post' and post_status == 'publish':
            get_posts(item, user)
        elif post_type == 'page' and post_status == 'publish':
            get_pages(item, user)
    if user.email:
        context = {
            'SITE_GLOBAL_SITEDISPLAYNAME': get_setting('site', 'global', 'sitedisplayname'),
            'SITE_GLOBAL_SITEURL': get_setting('site', 'global', 'siteurl'),
        }
        subject = ''.join(
            render_to_string(
                template_name='notification/wp_import/short.txt',
                context=context).splitlines())
        body = render_to_string(
            template_name='notification/wp_import/full.html',
            context=context)
        # send_mail(subject, body, settings.DEFAULT_FROM_EMAIL, [user.email], fail_silently=False)
        email = EmailMessage(subject, body, settings.DEFAULT_FROM_EMAIL, [user.email])
        email.content_subtype = 'html'
        email.send(fail_silently=True)
def test_consent_block_snippet_injection(rf):
    """
    Test that the GDPR consent is required to inject xtheme scripts
    """
    shop = factories.get_default_shop()
    client = SmartClient()
    index_url = reverse("shuup:index")

    # create a GDPR setting for the shop
    shop_gdpr = GDPRSettings.get_for_shop(shop)
    shop_gdpr.cookie_banner_content = "my cookie banner content"
    shop_gdpr.cookie_privacy_excerpt = "my cookie privacy excerpt"
    shop_gdpr.enabled = True
    shop_gdpr.save()

    # configure some snippets to be injected
    google_snippet = Snippet.objects.create(
        name="Google Analytics",
        snippet_type=SnippetType.InlineHTMLMarkup,
        location="body_end",
        shop=shop,
        snippet='<script id="google-script"></script>',
    )
    facebook_snippet = Snippet.objects.create(
        name="Facebook Pixel",
        snippet_type=SnippetType.InlineHTMLMarkup,
        location="body_end",
        shop=shop,
        snippet='<script id="facebook-script"></script>',
    )

    # create cookie categories
    required_cookie_category = GDPRCookieCategory.objects.create(
        shop=shop,
        always_active=True,
        cookies="cookie1,cookir2,_cookie3",
        name="RequiredCookies",
        how_is_used="to make the site work",
    )
    google_cookie_category = GDPRCookieCategory.objects.create(
        shop=shop,
        always_active=False,
        cookies="_google",
        name="GoogleCookies",
        how_is_used="to spy users",
    )
    google_cookie_category.block_snippets.add(google_snippet)
    facebook_cookie_category = GDPRCookieCategory.objects.create(
        shop=shop,
        always_active=False,
        cookies="_facebook",
        name="Facebook",
        how_is_used="to track users",
    )
    facebook_cookie_category.block_snippets.add(facebook_snippet)

    # create privacy policy GDPR document
    ensure_gdpr_privacy_policy(shop)
    response = client.get(index_url)
    assert settings.SHUUP_GDPR_CONSENT_COOKIE_NAME not in response.cookies

    # send consent only for the required and google categories
    response = client.post(
        reverse("shuup:gdpr_consent"),
        data={
            "cookie_category_{}".format(required_cookie_category.id): "on",
            "cookie_category_{}".format(google_cookie_category.id): "on",
            "cookie_category_{}".format(facebook_cookie_category.id): "off",
        },
    )
    assert settings.SHUUP_GDPR_CONSENT_COOKIE_NAME in response.cookies
    cookies_data = json.loads(response.cookies[settings.SHUUP_GDPR_CONSENT_COOKIE_NAME].value)
    for cookie in required_cookie_category.cookies.split(","):
        assert cookie in cookies_data["cookies"]
    for cookie in google_cookie_category.cookies.split(","):
        assert cookie in cookies_data["cookies"]
    for cookie in facebook_cookie_category.cookies.split(","):
        assert cookie not in cookies_data["cookies"]

    # send the request again, only the google script should be injected
    response = client.get(index_url)
    response.render()
    content = BeautifulStoneSoup(response.content)
    assert content.find_all("script", attrs={"id": "google-script"})
    assert not content.find_all("script", attrs={"id": "facebook-script"})
    # octopress will not show comment input??
    # ex> open, closed
    wp_comment_status = _(item.find("comment_status"))
    out.write(u'comments: %s\n' % ('true' if wp_comment_status == u'open' else 'false'))
    # end of yaml header
    out.write(u'---\n')
    content = _(item.find("encoded"))
    content = to_markdown(content.strip())
    out.write(content)
    out.close()


if __name__ == '__main__':
    if DEBUG:
        if os.access(LOGFILE, os.F_OK):
            os.remove(LOGFILE)
    # if len(sys.argv) > 1:
    #     XML = sys.argv[1]
    print 'loading...'
    soup = BeautifulStoneSoup(open(XML), features="xml")
    print 'parsing...'
    for item in soup.find_all("item"):
        parse_item(item)
    print 'done'
from bs4 import BeautifulStoneSoup
import json
import os

# Rip tags from dumped evernote file
markup = open('../data/aaronsw.enex').read()
soup = BeautifulStoneSoup(markup)
posts = soup.find_all('note')
tagged_posts = [i for i in posts if len(i.find_all('tag')) > 0]

tagged_posts_dict = {}
for post in tagged_posts:
    post_id = post.find_all('title')[0].text
    tags = [tag.text for tag in post.find_all('tag')]
    print post_id, tags
    tagged_posts_dict[post_id] = tags

# Add tags to blog_posts.json
blog_posts_file = open(os.path.join('..', 'data', 'blog_posts.json'), 'r+')
blog_posts = json.loads(blog_posts_file.read())
for post_title, post in blog_posts.iteritems():
    post_tags = tagged_posts_dict.get(post['postid'], [])
    blog_posts[post_title]['tags'] = post_tags