Example #1
def crawl_gsw_wettiger_nochrichte(crawler):
    urls = crawler.fetch_sitemap(
        'https://wettiger-nochrichte.net/sitemap.xml').keys()
    out = crawler.get_output('gsw-u-sd-chag')
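    # Keep only dated article URLs (paths starting with a 20xx year);
    # other sitemap entries are skipped.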
    for url in sorted(urls):
        if url.find('//wettiger-nochrichte.net/20') < 0:
            continue
        html = crawler.fetch(url).content.decode('utf-8')
        pubdate = re.search(r'<time class="entry-date" datetime="(.+?)"', html)
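        # The article body sits between the post-content marker and the
        # first <style> tag.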
        html = html.split('class="post-content">')
        html = html[1].split('<style')[0]
        paragraphs = []
        # re.split keeps the captured tag name ('p', 'h1', 'h2') in the result.
        for p in re.split(r'</?(p|h1|h2).*?>', html):
            p = ' '.join(replace_html_entities(striptags(p)).split())
            if ((p not in ('', 'p', 'h1', 'h2')) and (not p.startswith('http'))
                    and ('<' not in p)
                    and (not p.endswith('by Wettiger Nochrichte'))
                    and (not p.endswith('by LuFiLa'))
                    and (not p.endswith('by Wettiger'))):
                paragraphs.append(p)
        if len(paragraphs) > 0:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate.group(1))
            for p in paragraphs:
                out.write(p + '\n')
Example #2
def crawl_than_lwin_times(crawler, out):
    sitemap = crawler.fetch_sitemap('http://thanlwintimes.com/sitemap.xml')
    for url in sorted(sitemap.keys()):
        html = crawler.fetch(url).content.decode('utf-8')
        pubdate = re.search(r'<meta itemprop="datePublished" content="(.+?)"',
                            html)
        if pubdate is None:
            continue
        # Normalize the markup so the article body can be split on '</div><p>'.
        html = html.replace('</div><pre>', '</div><p>')
        html = html.replace('</div><div class="td-post-content"><p>',
                            '</div><p>')
        if html.find('</div><p>') < 0:
            continue
        text = html.split('</div><p>')[1]
        text = text.split('<div class=\'sfsi_Sicons ')[0]
        text = text.split('</noscript>')[0]
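        # Flatten the markup, then turn paragraph and div boundaries into
        # line breaks so each paragraph ends up on its own line.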
        text = text.replace('\n', ' ')
        text = text.replace('</p>', '\n').replace('</div>', '\n')
        paragraphs = []
        for p in text.splitlines():
            p = ' '.join(striptags(replace_html_entities(p)).split())
            if p and ('>' not in p) and (p.find('"caption":') < 0):
                paragraphs.append(p)
        if len(paragraphs) > 0:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            out.write('# Publication-Date: %s\n' % pubdate.group(1))
            for p in paragraphs:
                out.write(p + '\n')
Example #3
def _find_urls_jornalet_com(crawler):
    urls = set()
    main = crawler.fetch_content('https://www.jornalet.com/actualitats')
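    # The category index is paginated; the highest page number linked from
    # the first page tells us how many index pages to fetch.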
    num_pages = max([int(p) for p
                     in re.findall(r'actualitats/pagina/(\d+)"', main)])
    for p in range(1, num_pages + 1):
        index = crawler.fetch_content(
            'https://www.jornalet.com/actualitats/pagina/%d' % p)
        for u in re.findall(r'"(https://www.jornalet.com/nova/[^"]+)"', index):
            url = replace_html_entities(u.split('#')[0])
            url = url.replace(' ', '')
            urls.add(url)
    return urls
Example #4
def crawl_panglong(crawler, out):
    urls = set()
    extract_urls = lambda h: re.findall(r'http://panglong.org/\?p=[0-9]+', h)
    for cat in range(1, 20):
        caturl = 'http://panglong.org/?cat=%d' % cat
        page = crawler.fetch(caturl)
        if page.status != 200:
            continue
        content = page.content.decode('utf-8')
        urls.update(extract_urls(content))
        pageids = re.findall(r';paged=([0-9]+)', content)
        if len(pageids) > 0:
            for pageid in range(2, max([int(p) for p in pageids]) + 1):
                cpurl = 'http://panglong.org/?cat=%d&paged=%d' % (cat, pageid)
                page = crawler.fetch(cpurl)
                if page.status == 200:
                    urls.update(extract_urls(page.content.decode('utf-8')))
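    # Second pass: fetch every article and extract title, date, and body.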
    for url in urls:
        try:
            html = crawler.fetch(url).content.decode('utf-8')
        except UnicodeDecodeError:  # a handful of documents are invalid utf8
            continue
        pubdate = re.search(r'<meta itemprop="datePublished" content="(.+?)"',
                            html)
        if pubdate is not None:
            pubdate = pubdate.group(1).strip()
        title = re.search(r'<meta property="og:title" content="(.+?)"', html)
        paras = []
        if title is not None:
            paras.append(title.group(1).strip())
        if html.find('class="entry-content">') > 0:
            text = html.split('class="entry-content">')[1]
            text = text.split('<div')[0]
            for p in text.split('</p>'):
                p = ' '.join(striptags(replace_html_entities(p)).split())
                if p:
                    paras.append(p)
        if len(paras) == 0:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for p in paras:
            out.write(p + '\n')
Example #5
def _find_tirreno_urls(crawler, category):
    site = 'http://iltirreno.gelocal.it/'
    urls = set()
    caturl = site + category
    catpage = crawler.fetch_content(caturl)
    num_pages = re.search(r'Pagina <span class="active">\d+</span> di (\d+)',
                          catpage)
    baseurl = re.search(r'<a title="Vai a pagina 1" href="([^"]+)"', catpage)
    if num_pages is None or baseurl is None:
        return urls
    num_pages = int(num_pages.group(1))
    baseurl = urljoin(site, baseurl.group(1))
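    # Page 1 is served at the bare category URL; later pages add ?page=N.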
    for p in range(1, num_pages + 1):
        url = '%s?page=%d' % (baseurl, p) if p > 1 else baseurl
        content = crawler.fetch_content(url)
        for u in re.findall(r'<h1><a href="([^"]+)">', content):
            u = urljoin(site, replace_html_entities(u.strip()))
            if not u.startswith('http://old.iltirreno.gelocal.it/'):
                urls.add(u)
    return urls
Example #6
def crawl_mon_news(crawler, out):
    urls = set()
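    # The archive is organized by year; each year is paginated under
    # /<year>/page/<n>/.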
    for year in range(2009, datetime.today().year + 1):
        first_page = crawler.fetch('http://mon.monnews.org/%d/' % year)
        html = first_page.content.decode('utf-8')
        urls.update(extract_mon_news_urls(html))
        num_pages = re.search(
            r'<a href="http://mon.monnews.org/\d+/page/(\d+)/" class="last"',
            html)
        if num_pages is not None:
            num_pages = int(num_pages.group(1))
            for page in range(2, num_pages + 1):
                next_page = crawler.fetch(
                    'http://mon.monnews.org/%d/page/%d/' % (year, page))
                if next_page.status != 200:
                    continue
                html = next_page.content.decode('utf-8')
                urls.update(extract_mon_news_urls(html))
    for url in sorted(urls):
        html = crawler.fetch(url).content.decode('utf-8')
        pubdate = re.search(
            r'<meta property="article:published_time" content="(.+?)"', html)
        if pubdate is None:
            continue
        pubdate = pubdate.group(1)
        text = html.split('</section>')[1].split('<div class="sharedaddy')[0]
        text = text.split('Share this:')[0]
        text = text.replace('\n', ' ')
        text = text.replace('</p>', '\n').replace('</div>', '\n')
        paragraphs = []
        for p in text.splitlines():
            p = ' '.join(striptags(replace_html_entities(p)).split())
            if p and '>' not in p:
                paragraphs.append(p)
        if len(paragraphs) > 0:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            out.write('# Publication-Date: %s\n' % pubdate)
            for p in paragraphs:
                out.write(p + '\n')
Example #7
def crawl_manxradio(crawler, out):
    urls = set()
    for i in range(1, 100):
        url = 'http://www.manxradio.com/news/manx-gaelic/archive/?page=%d' % i
        r = crawler.fetch(url)
        if r.status != 200 or r.content.find(b'No stories to show.') > 0:
            break
        html = r.content.decode('utf-8')
        for p in re.findall(r'<a href="/(news/manx-gaelic/[^"]+)"', html):
            url = 'http://www.manxradio.com/' + p
            if url.find('?') < 0:
                urls.add(url)
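    # Fetch each article; pages without an abstract or with audio-only
    # content are skipped.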
    for url in urls:
        r = crawler.fetch(url)
        assert r.status == 200, r.status
        html = r.content.decode('utf-8')
        pubdate = _extract_manxradio_timestamp(html)
        text = html.split('<p class="news-abstract">')
        if len(text) < 2:
            continue
        text = text[1].split('<STRONG>')[0].split('<strong>')[0]
        text = text.split('<p><span lang=""><b>')[0]
        text = text.replace('<p>', '\n').replace('</p>', '\n')
        text = text.replace('<P>', '\n').replace('</P>', '\n')
        text = striptags(replace_html_entities(text))
        text = text.replace(' - ', ' – ').replace("'", '’')
        if text.find('Listen to this audio') >= 0:
            continue
        paras = [' '.join(s.split()) for s in text.splitlines()]
        paras = [p for p in paras if p]
        if len(paras) == 0:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for p in paras:
            out.write(p + '\n')
Example #8
def crawl_nupepa_org(crawler, out):
    urls = set()
    for i in range(1, 104):
        url = ('http://nupepa.org/gsdl2.5/cgi-bin/nupepa?e=p-0nupepa--'
               '00-0-0--010---4-----text---0-1l--1en-Zz-1---20-about---'
               '0003-1-0000utfZz-8-00&a=d&cl=CL2.' + str(i))
        doc = crawler.fetch(url)
        assert doc.status == 200, url
        content = doc.content.decode('utf-8')
        for u in re.findall(r'href="(/gsdl2.5/cgi-bin/nupepa[^"]+)"', content):
            if u.endswith('gg=text'):
                urls.add('http://nupepa.org' + replace_html_entities(u))
    for url in sorted(urls):
        doc = crawler.fetch(url)
        assert doc.status == 200, url
        content = doc.content.decode('utf-8')
        if content.find('Document contains no data') >= 0:
            continue
        pubdate = re.search(
            r'tif_([0-9]{4})([01][0-9])([0123][0-9])\.tif"', content)
        pubdate = ('%s-%s-%s' % pubdate.groups()) if pubdate else None
        paras = []
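        # Collect the article text page by page, following the
        # "next page" links until there are none left.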
        while True:
            text = extract(
                "<p class=MsoNormal style='text-autospace:none'><span style='font-size:10.0pt'>",
                "</table>", content)
            if not text:
                break
            text = text.replace('\n', ' ').replace('<br>', '\n')
            text = replace_html_entities(text.replace('&nbsp;', ' '))
            paras.extend([cleantext(p) for p in text.splitlines()])
            nexturl = re.search(r'<a href="([^"]+)">next page', content)
            if nexturl is None:
                break
            nexturl = 'http://nupepa.org' + replace_html_entities(nexturl.group(1))
            doc = crawler.fetch(nexturl)
            assert doc.status == 200, (doc.status, nexturl)
            content = doc.content.decode('utf-8')
        text = '\n'.join(filter(None, paras))
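        # Strip known English-language passages embedded in the articles.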
        text = re.sub(
            r'DEATH OF MR\. DOUGLAS.+?has not been heard of since\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'IV\. "Their Majesties do further agree.+?by the parties\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'1 Oh, come, come away, from labor now reposing,.+?'
            r'Honolulu, Nov\. 25, 1861\. J\. L\. N\.\*', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'Died at sea, August 14.+?after a passage of about a month\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'On the 26th ult\. the Rev\. J.+?best wishes to you all\."', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'The subscriber avails himself.+?agreeable circumstances\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'NOTICE\. The publishing of.+?for want of paper\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'ARRIVALS AT OAHU, SANDWICH ISLANDS,.+?Sold here to the Government\.',
            '', text, flags=re.DOTALL)
        text = re.sub(
            r'NOTICE\. NOTICE is hereby given,.+?by the subscriber\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'Articles made and agreed.+?upon the Sandwich Islands\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'MRS\. MARIA M\. DIBBLE\. Died at Lahainaluna.+?SHELDON DIBBLE\.',
            '', text, flags=re.DOTALL)
        text = re.sub(
            r'DEATH OF MRS\. BETSEY C\. LYONS.+?the son of man cometh\.\"', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'CARD\. The Missionary Company.+?April 20th 1837\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'DISTRESS OF THE WHALE SHIP GEORGE.+?who is now master of her\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'KNOW ALL MEN, That according.+?especially those above re-', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'cited, of the said Commissioners.+?and acknowledge the Protest', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'and withdrawal of our Deputy as our own.+?in the dominions of the Queen of',
            '', text, flags=re.DOTALL)
        text = re.sub(
            r'Taheite that I have received instructions.+?Commodore\. \[Official Copy\]',
            '', text, flags=re.DOTALL)
        text = re.sub(
            r'TO HIS MAJ\. KAMEHAMEHA.+?Naval Force in the E\. Indies\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'To the House of Representatives of the United States.+?'
            r'the arts of civilized life\.', '', text, flags=re.DOTALL)
        text = re.sub(
            r'It cannot but be in conformity.+?right to complain\.', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'The Committee on Foreign Affairs, to whom was.+?peace and love\.',
            '', text, flags=re.DOTALL)
        text = re.sub(
            r'WASHINGTON, June 25th, 1843.+?treat upon all occassions, the', '',
            text, flags=re.DOTALL)
        text = re.sub(
            r'native rulers of the Sandwich.+?P\. Upshur, &c\. &c\.', '',
            text, flags=re.DOTALL)
        if text.startswith('TERMS. One copy'):  # Article entirely in English.
            continue
        paras = list(filter(None, [cleantext(p) for p in text.splitlines()]))
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')