def scrape_root(self, root, helper):
    """ Scrape a root URL """
    t0 = time.time()
    # Fetch the root URL and scrape all child URLs that refer
    # to the same domain suffix and that we haven't seen before
    logging.info("Fetching root {0}".format(root.url))
    # Read the HTML document at the root URL
    html_doc = Fetcher.raw_fetch_url(root.url)
    if not html_doc:
        logging.warning("Unable to fetch root {0}".format(root.url))
        return
    # Parse the HTML document
    soup = Fetcher.make_soup(html_doc)
    # Obtain the set of child URLs to fetch
    fetch_set = Fetcher.children(root, soup)
    # Add the children whose URLs we don't already have to the
    # scraper articles table
    with SessionContext() as session:
        for url in fetch_set:
            if helper and helper.skip_url(url):
                # The helper doesn't want this URL
                continue
            # noinspection PyBroadException
            try:
                article = ArticleRow(url=url, root_id=root.id)
                # Leave article.scraped as NULL for later retrieval
                session.add(article)
                session.commit()
            except IntegrityError:
                # Article URL already exists in the database:
                # roll back and continue
                session.rollback()
            except Exception as e:
                logging.warning(
                    "Roll back due to exception in scrape_root: {0}"
                    .format(e)
                )
                session.rollback()
    t1 = time.time()
    logging.info("Root scrape completed in {0:.2f} seconds".format(t1 - t0))
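# For reference, below is a minimal, self-contained sketch of the
# insert-then-rollback pattern used in scrape_root() above, built on plain
# SQLAlchemy. The DemoArticle model, its table name, and the in-memory
# SQLite engine are illustrative stand-ins, not the project's actual
# ArticleRow schema or SessionContext machinery.

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class DemoArticle(Base):
    __tablename__ = "demo_articles"
    id = Column(Integer, primary_key=True)
    # The unique constraint on the URL is what makes a duplicate
    # insert raise IntegrityError at commit time
    url = Column(String, unique=True)


def demo_insert_pattern():
    engine = create_engine("sqlite://")
    Base.metadata.create_all(engine)
    with Session(engine) as session:
        for url in ("https://example.com/a", "https://example.com/a"):
            try:
                session.add(DemoArticle(url=url))
                # Committing per row lets a single duplicate fail
                # without discarding the rest of the batch
                session.commit()
            except IntegrityError:
                # The second insert of the same URL lands here:
                # roll back and continue, mirroring scrape_root()
                session.rollback()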
def urls2fetch(self, root, helper):
    """ Returns a set of URLs to fetch. If the scraper helper class
        has associated RSS feed URLs, these are used to acquire
        article URLs. Otherwise, the URLs are found by scraping the
        root website and searching for links to subpages. """
    fetch_set = set()
    feeds = None if helper is None else helper.feeds
    if feeds:
        for feed_url in feeds:
            logging.info("Fetching feed {0}".format(feed_url))
            try:
                d = feedparser.parse(feed_url)
            except Exception as e:
                logging.warning(
                    "Error fetching/parsing feed {0}: {1}"
                    .format(feed_url, str(e))
                )
                continue
            for entry in d.entries:
                if entry.link and not helper.skip_rss_entry(entry):
                    fetch_set.add(entry.link)
    else:
        # Fetch the root URL and scrape all child URLs
        # that refer to the same domain suffix
        logging.info("Fetching root {0}".format(root.url))
        # Read the HTML document at the root URL
        html_doc = Fetcher.raw_fetch_url(root.url)
        if not html_doc:
            logging.warning("Unable to fetch root {0}".format(root.url))
            # Return the empty set rather than None so that callers
            # can always iterate over the result
            return fetch_set
        # Parse the HTML document
        soup = Fetcher.make_soup(html_doc)
        # Obtain the set of child URLs to fetch
        fetch_set = Fetcher.children(root, soup)
    return fetch_set
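# A small sketch of how the RSS branch above consumes a feed with
# feedparser. Note that feedparser.parse() rarely raises; parse problems
# are normally reported via the result's `bozo` flag, so the try/except
# in urls2fetch() is a belt-and-braces guard. The feed URL and function
# name here are illustrative only.

import feedparser


def demo_feed_links(feed_url="https://example.com/rss.xml"):
    d = feedparser.parse(feed_url)
    if d.bozo:
        # bozo_exception describes what went wrong during parsing
        logging.warning("Feed problem: {0}".format(d.bozo_exception))
    links = set()
    for entry in d.entries:
        # Entries are not guaranteed to carry a link attribute;
        # FeedParserDict supports dict-style .get() with a default
        link = entry.get("link")
        if link:
            links.add(link)
    return links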