Example 1
0
 def test_skip_by_class(self):
     """Don't extract links of some class."""
     html = ('<a href="/wiki/foo" class="image"><img src="url" /></a>'
             '<a class="internal" href="/wiki/foo" title="foo">foo</a>')
     wikifile = FakeWikiFile(html)
     links = extract_pages(wikifile.soup)
     assert len(list(links)) == 0
Example 2
0
def scrap_portal(language, lang_config):
    """Get the portal index and scrap it."""
    # get the portal url, get out if don't have it
    portal_index_title = lang_config.get('portal_index')
    if portal_index_title is None:
        logger.info("Not scraping portals, url not configured.")
        return

    logger.info("Scraping portal main page %s", portal_index_title)
    with NamedTemporaryFile('wt',
                            encoding='utf8',
                            dir='/tmp/',
                            prefix='cdpedia-') as tf:
        tf.write(portal_index_title + '\n')
        tf.flush()
        _call_scraper(language, tf.name)

    dir3, quoted_page = to3dirs.get_path_file(portal_index_title)
    portal_filepath = os.path.join(location.articles, dir3, quoted_page)

    logger.info("Parsing portal page")
    with open(portal_filepath, 'rt', encoding='utf8') as fh:
        soup = bs4.BeautifulSoup(fh, features="html.parser")

    cnt = 0
    _path = os.path.join(location.langdir, PORTAL_PAGES)
    with open(_path, 'wt', encoding='utf8') as fh:
        for page in preprocessors.extract_pages(soup):
            cnt += 1
            fh.write(page + '\n')

    logger.info("Scraping portal sub pages (total=%d)", cnt)
    _call_scraper(language, _path)

    logger.info("Portal scraping done")
Example 3
0
 def test_extract_portal_link_redirect(self):
     """Redirection links to portal pages must be extracted."""
     html = ('<a href="/wiki/Portal:Astron%C3%A1utica" class="mw-redirect" '
             'title="Portal:Astronáutica">Astronáutica</a>')
     wikifile = FakeWikiFile(html)
     links = extract_pages(wikifile.soup)
     assert list(links) == ['Portal:Astronáutica']
Example 4
0
 def test_extract_portal_link_normal(self):
     """Links to portal pages must be extracted."""
     html = ('<a href="/wiki/Portal:Exploraci%C3%B3n_espacial" '
             'title="Portal:Exploración espacial">Exploración espacial</a>')
     wikifile = FakeWikiFile(html)
     links = extract_pages(wikifile.soup)
     assert list(links) == ['Portal:Exploración_espacial']
Example 5
0
 def test_remove_link_fragment(self):
     """Remove fragment from page URL."""
     html = '<a href="/wiki/foo#bar">foobar</a>'
     wikifile = FakeWikiFile(html)
     links = extract_pages(wikifile.soup)
     assert list(links) == ['foo']
Example 6
0
 def test_skip_non_wiki_urls(self):
     """Don't extract links without a '/wiki/' prefix."""
     html = '<a href="/nowiki/foo">foo</a>'
     wikifile = FakeWikiFile(html)
     links = extract_pages(wikifile.soup)
     assert list(links) == []
Example 7
0
 def test_extract_link(self):
     """Normal links to wiki pages must be extracted."""
     html = '<a href="/wiki/N%C3%BAmero_natural" title="Número natural">número natural</a>'
     wikifile = FakeWikiFile(html)
     links = extract_pages(wikifile.soup)
     assert list(links) == ['Número_natural']