def trigger_w(self, msg):
    "Usage: w <search term>. Prints a short description of the corresponding Wikipedia article."
    if len(msg.args) == 0:
        self.bot.notice(msg.nick, "Please specify a search term")
        return
    params = {
        'action': 'opensearch',
        'format': 'xml',
        'limit': '2',
        'search': ' '.join(msg.args)
    }
    url = 'http://{:s}.wikipedia.org/w/api.php'.format(self.language)
    response = BeautifulStoneSoup(requests.post(url, data=params).text)
    # BS4 tag searches are case sensitive, hence the case-insensitive regexes.
    if response.find(re.compile('text', re.I)):
        index = 0
        if "may refer to:" in response.find(re.compile('description', re.I)).string:
            index = 1
        info = response.find_all(re.compile('description', re.I))[index].string.strip()
        url = response.find_all(re.compile('url', re.I))[index].string
        short_url = self.shorten(url)
        message = u"\002Wikipedia ::\002 {} \002::\002 {}".format(info, short_url)
        self.bot.privmsg(msg.channel, message)
    else:
        self.bot.privmsg(msg.channel, "{}: no articles were found.".format(' '.join(msg.args)))
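# A minimal standalone sketch of the same Wikipedia OpenSearch call, for
# reference only. It assumes the requests, bs4 and lxml packages and a
# hard-coded "en" language code; the XML response capitalises its tag names
# ("Item", "Description", "Url"), which is why the handler above resorts to
# case-insensitive regexes when searching the soup.
import requests
from bs4 import BeautifulSoup


def opensearch(term, language="en"):
    params = {
        'action': 'opensearch',
        'format': 'xml',
        'limit': '2',
        'search': term,
    }
    url = 'https://{}.wikipedia.org/w/api.php'.format(language)
    soup = BeautifulSoup(requests.get(url, params=params).text, 'xml')
    results = []
    # Each <Item> in the response wraps a <Text>, <Description> and <Url> element.
    for item in soup.find_all('Item'):
        description = item.find('Description')
        link = item.find('Url')
        if description and link:
            results.append((description.get_text(strip=True), link.get_text(strip=True)))
    return results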
def run(self):
    """Poll the RSS feed once an hour and store items newer than the newest saved id."""
    while True:
        try:
            maxid = self.db.news_list()[0]['id']
        except Exception:
            maxid = 1
        print(maxid)
        client = HTTPClient()
        response = client.fetch('http://cs.hust.edu.cn/rss')
        result = response.body.decode('utf-8', errors='ignore')
        soup = BeautifulStoneSoup(result)
        items = soup.find_all('item')
        for item in items:
            title = item.title.text
            link = item.link.text
            desc = item.description.text
            linkid = self.link_id(link)
            if linkid > maxid:
                result = self.db.add_news(linkid, title, desc, link)
                if result:
                    result = self.get_article(link)
            else:
                # Stop at the first item that is not newer than the newest stored id.
                break
        time.sleep(3600)
def _parse_html(self, url):
    '''
    Loads the page at `url` and processes every link found on it.
    Recursively calls itself for the first link that has not been
    processed yet, so the whole site ends up being parsed.
    '''
    html = None
    page_content = None
    self._processed.add(url)
    self._recursion_counter += 1
    try:
        with urllib.request.urlopen(url) as response:
            html = response.read()
    except Exception:
        html = None
        print('Unable to load url %s' % url)
    if html:
        try:
            page_content = Soup(html)
        except Exception:
            page_content = None
    if page_content:
        stop_list = ('#', '', '/')
        for a in page_content.find_all('a', href=True):
            if a['href'] not in stop_list:
                href = self._build_link(url=a['href'], location_parts=urlparse(url))
                if href:
                    try:
                        self._url_validator(href)
                    except ValidationError:
                        print('%s is not valid url' % href)
                    else:
                        self._finds.add(href)
        self._add_location(url)
    unprocessed = self._finds - self._processed
    print('Total pages: %s. Pages processed: %s. Recipes found: %s. Last URL: %s'
          % (len(self._finds), len(self._processed), len(self._urls), url))
    # Save self._urls on every 20th call of this method.
    if self._recursion_counter % 20 == 0:
        self._save()
        self._recursion_counter = 0
    if unprocessed:
        if self.sleep_time > 0:
            time.sleep(self.sleep_time)
        next_url = list(unprocessed)[0]
        self._parse_html(next_url)
def get_all_urls(self):
    """Return the list of URLs listed in the sitemaps."""
    list_of_urls = []
    for url in self.urls:
        response = self.session.get(url, headers=self.headers)
        soup = BeautifulStoneSoup(response.content)
        locs = soup.find_all('loc')
        list_of_urls += [loc.next_element for loc in locs]
    return list_of_urls
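# A standalone sketch of the same <loc> extraction for a single sitemap, in
# case the surrounding class is not available. The sitemap URL is a made-up
# placeholder, and the 'xml' parser assumes lxml is installed.
import requests
from bs4 import BeautifulSoup


def sitemap_locs(sitemap_url='https://example.com/sitemap.xml'):
    response = requests.get(sitemap_url)
    soup = BeautifulSoup(response.content, 'xml')
    # Each <loc> element in a sitemap wraps exactly one URL.
    return [loc.get_text(strip=True) for loc in soup.find_all('loc')]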
def render(self, context):
    fancount = ''
    fb_api_url = 'http://api.facebook.com/restserver.php'
    tw_api_url = 'http://api.twitter.com'
    cache_key = ''
    cache_time = 1800
    if self.service == "facebook":
        query = '%s?method=facebook.fql.query&query=SELECT%%20fan_count%%20FROM%%20page%%20WHERE%%20page_id=%s'
        xml_path = query % (fb_api_url, self.service_id)
        cache_key = md5(xml_path.encode()).hexdigest()
        fancount = cache.get(cache_key)
        if not fancount:
            try:
                xml = urlopen(xml_path)
                content = xml.read()
                soup = BeautifulStoneSoup(content)
                nodes = soup.find_all('page')
                for node in nodes:
                    fancount = node.fan_count.string
                cache.set(cache_key, fancount, cache_time)
            except Exception:
                pass
    if self.service == "twitter":
        query = "%s/1/users/show/%s.xml"
        xml_path = query % (tw_api_url, self.service_id)
        cache_key = md5(xml_path.encode()).hexdigest()
        fancount = cache.get(cache_key)
        if not fancount:
            try:
                xml = urlopen(xml_path)
                content = xml.read()
                soup = BeautifulStoneSoup(content)
                nodes = soup.find_all('user')
                for node in nodes:
                    fancount = node.followers_count.string
                cache.set(cache_key, fancount, cache_time)
            except Exception:
                pass
    return fancount
def parse(word: str, soup: BeautifulStoneSoup) -> dict:
    entries = []
    word = {'word': word, 'entries': entries}
    for entry in soup.find_all(class_='ldoceEntry Entry'):
        entries.append({})
        last_entry = entries[-1]
        with suppress(AttributeError):
            american_pron = entry.find(class_='AMEVARPRON')
            american = f'/{american_pron.text.strip()}/' if american_pron else ''
            last_entry['pron'] = '/{english}/ {american}'.format(
                english=entry.find(class_='PRON').text.strip(),
                american=american,
            ).rstrip()
        try:
            last_entry['pos'] = entry.find(class_='POS').text.strip()
        except AttributeError:
            entries.pop()
            continue
        senses = last_entry['senses'] = []
        for sense in entry.find_all(class_='Sense'):
            senses.append({})
            last_sense = senses[-1]
            try:
                last_sense['definition'] = sense.find(class_='DEF').text.strip()
            except AttributeError:
                try:
                    last_sense['definition'] = sense.find(class_='REFHWD').text.strip()
                except AttributeError:
                    senses.pop()
                    continue
            find_rel = sense.find(class_='RELATEDWD')
            if find_rel:
                last_sense['rel'] = find_rel.text.strip()[2:]
            find_syn = sense.find(class_='SYN')
            if find_syn:
                last_sense['syn'] = find_syn.text.strip()[4:]
            find_opp = sense.find(class_='OPP')
            if find_opp:
                last_sense['opp'] = find_opp.text.strip()[4:]
            last_sense['examples'] = [
                e.text.strip() for e in sense.find_all(class_='EXAMPLE')
            ]
    return word
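# A hedged usage sketch for parse() above: it assumes the ldoceonline.com
# dictionary URL layout and that requests/bs4 are installed. parse() only
# needs a soup of the entry page, however it was fetched.
import requests
from bs4 import BeautifulSoup


def lookup(word):
    url = 'https://www.ldoceonline.com/dictionary/{}'.format(word)
    html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text
    return parse(word, BeautifulSoup(html, 'html.parser'))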
def run(self, file_name, user, **kwargs):
    """
    Parse the given xml file using BeautifulSoup.
    Save all Article, Redirect and Page objects.
    """
    with open(file_name, 'r') as f:
        xml = f.read()
    soup = BeautifulStoneSoup(xml)
    items = soup.find_all('item')
    for item in items:
        post_type = item.find('wp:post_type').string
        post_status = item.find('wp:status').string
        if post_type == 'attachment':
            get_media(item, user)
            # Note! This script assumes all the attachments come before
            # posts and pages in the xml. If this ends up changing,
            # do two loops, one with attachments and the second with posts and pages.
        elif post_type == 'post' and post_status == 'publish':
            get_posts(item, user)
        elif post_type == 'page' and post_status == 'publish':
            get_pages(item, user)
    if user.email:
        context = {
            'SITE_GLOBAL_SITEDISPLAYNAME': get_setting('site', 'global', 'sitedisplayname'),
            'SITE_GLOBAL_SITEURL': get_setting('site', 'global', 'siteurl'),
        }
        subject = ''.join(
            render_to_string(
                template_name='notification/wp_import/short.txt',
                context=context).splitlines())
        body = render_to_string(
            template_name='notification/wp_import/full.html',
            context=context)
        # send_mail(subject, body, settings.DEFAULT_FROM_EMAIL, [user.email], fail_silently=False)
        email = EmailMessage(subject, body, settings.DEFAULT_FROM_EMAIL, [user.email])
        email.content_subtype = 'html'
        email.send(fail_silently=True)
def test_consent_block_snippet_injection(rf):
    """
    Test that the GDPR consent is required to inject xtheme scripts
    """
    shop = factories.get_default_shop()
    client = SmartClient()
    index_url = reverse("shuup:index")

    # create a GDPR setting for the shop
    shop_gdpr = GDPRSettings.get_for_shop(shop)
    shop_gdpr.cookie_banner_content = "my cookie banner content"
    shop_gdpr.cookie_privacy_excerpt = "my cookie privacy excerpt"
    shop_gdpr.enabled = True
    shop_gdpr.save()

    # configure some snippets to be injected
    google_snippet = Snippet.objects.create(
        name="Google Analytics",
        snippet_type=SnippetType.InlineHTMLMarkup,
        location="body_end",
        shop=shop,
        snippet='<script id="google-script"></script>',
    )
    facebook_snippet = Snippet.objects.create(
        name="Facebook Pixel",
        snippet_type=SnippetType.InlineHTMLMarkup,
        location="body_end",
        shop=shop,
        snippet='<script id="facebook-script"></script>',
    )

    # create cookie categories
    required_cookie_category = GDPRCookieCategory.objects.create(
        shop=shop,
        always_active=True,
        cookies="cookie1,cookir2,_cookie3",
        name="RequiredCookies",
        how_is_used="to make the site work",
    )
    google_cookie_category = GDPRCookieCategory.objects.create(
        shop=shop,
        always_active=False,
        cookies="_google",
        name="GoogleCookies",
        how_is_used="to spy users",
    )
    google_cookie_category.block_snippets.add(google_snippet)
    facebook_cookie_category = GDPRCookieCategory.objects.create(
        shop=shop,
        always_active=False,
        cookies="_facebook",
        name="Facebook",
        how_is_used="to track users",
    )
    facebook_cookie_category.block_snippets.add(facebook_snippet)

    # create privacy policy GDPR document
    ensure_gdpr_privacy_policy(shop)
    response = client.get(index_url)
    assert settings.SHUUP_GDPR_CONSENT_COOKIE_NAME not in response.cookies

    # send consent only for the required and google categories
    response = client.post(
        reverse("shuup:gdpr_consent"),
        data={
            "cookie_category_{}".format(required_cookie_category.id): "on",
            "cookie_category_{}".format(google_cookie_category.id): "on",
            "cookie_category_{}".format(facebook_cookie_category.id): "off",
        },
    )
    assert settings.SHUUP_GDPR_CONSENT_COOKIE_NAME in response.cookies
    cookies_data = json.loads(response.cookies[settings.SHUUP_GDPR_CONSENT_COOKIE_NAME].value)
    for cookie in required_cookie_category.cookies.split(","):
        assert cookie in cookies_data["cookies"]
    for cookie in google_cookie_category.cookies.split(","):
        assert cookie in cookies_data["cookies"]
    for cookie in facebook_cookie_category.cookies.split(","):
        assert cookie not in cookies_data["cookies"]

    # send the request again, only the google script should be injected
    response = client.get(index_url)
    response.render()
    content = BeautifulStoneSoup(response.content)
    assert content.find_all("script", attrs={"id": "google-script"})
    assert not content.find_all("script", attrs={"id": "facebook-script"})
    # octopress will not show comment input??
    # ex> open, closed
    wp_comment_status = _(item.find("comment_status"))
    out.write(u'comments: %s\n' % ('true' if wp_comment_status == u'open' else 'false'))
    # end of yaml header
    out.write(u'---\n')
    content = _(item.find("encoded"))
    content = to_markdown(content.strip())
    out.write(content)
    out.close()


if __name__ == '__main__':
    if DEBUG:
        if os.access(LOGFILE, os.F_OK):
            os.remove(LOGFILE)
    # if len(sys.argv) > 1:
    #     XML = sys.argv[1]
    print 'loading...'
    soup = BeautifulStoneSoup(open(XML), features="xml")
    print 'parsing...'
    for item in soup.find_all("item"):
        parse_item(item)
    print 'done'
from bs4 import BeautifulStoneSoup
import json
import os

# Rip tags from dumped evernote file
markup = open('../data/aaronsw.enex').read()
soup = BeautifulStoneSoup(markup)
posts = soup.find_all('note')
tagged_posts = [i for i in posts if len(i.find_all('tag')) > 0]

tagged_posts_dict = {}
for post in tagged_posts:
    post_id = post.find_all('title')[0].text
    tags = [tag.text for tag in post.find_all('tag')]
    print post_id, tags
    tagged_posts_dict[post_id] = tags

# Add tags to blog_posts.json
blog_posts_file = open(os.path.join('..', 'data', 'blog_posts.json'), 'r+')
blog_posts = json.loads(blog_posts_file.read())
for post_title, post in blog_posts.iteritems():
    post_tags = tagged_posts_dict.get(post['postid'], [])
    blog_posts[post_title]['tags'] = post_tags