# Standard-library imports used by the crawlers in this section; helpers
# such as extract(), cleantext(), clean_paragraphs(), and urlencode() are
# defined elsewhere in the module.
import json
import re
import ssl
from datetime import datetime

try:
    from urllib.parse import urljoin, urlparse  # Python 3
except ImportError:
    from urlparse import urljoin, urlparse  # Python 2


def _crawl_mwnation_com(crawler, out):
    urls = set()
    index = crawler.fetch_content('http://mwnation.com/section/chichewa/')
    pages = re.findall(r'/section/chichewa/page/(\d+)/', index)
    num_pages = max([int(p) for p in pages])
    for page in range(1, num_pages + 1):
        url = 'http://mwnation.com/section/chichewa/'
        if page > 1:
            url += 'page/%d/' % page
        doc = crawler.fetch_content(url)
        urls.update(re.findall(r'<a href="([^"]+?)">Continue Reading', doc))
    for url in sorted(urls):
        doc = crawler.fetch_content(url)
        pubdate = re.search(
            r'<meta property="article:published_time" content="([^"]+)"',
            doc)
        pubdate = pubdate.group(1) if pubdate is not None else None
        title = extract('<h1 class="entry-title" itemprop="headline">',
                        '</h1>', doc) or ''
        body = extract('<div class="entry-content" itemprop="articleBody">',
                       '<footer ', doc) or ''
        paras = clean_paragraphs(title + '<br/>' + body)
        text = '\n'.join(paras) + '\n'
        if text.find(' the ') >= 0:  # likely English
            continue
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write(text)

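# extract() is defined elsewhere in the crawler; the sketch below is a
# hypothetical reconstruction inferred from the call sites in this file:
# it returns the text between the first occurrence of two literal markers,
# or None when either marker is missing (hence the frequent `or ''` guards).
def extract(start, end, html):
    startpos = html.find(start)
    if startpos < 0:
        return None
    startpos += len(start)
    endpos = html.find(end, startpos)
    return html[startpos:endpos] if endpos >= 0 else None
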
def _crawl_kauno_diena_lt(crawler, out):
    urls = {}
    for i in range(1, 6):
        url = 'http://kauno.diena.lt/sitemap/kd/sitemap%d.xml' % i
        urls.update(crawler.fetch_sitemap(url))
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        try:
            html = doc.content.decode('utf-8')
        except UnicodeDecodeError:
            continue
        title = extract('<h1 class="title" id="page-title">', '</h1>', html)
        title = cleantext(title if title else '')
        body = extract("<span itemprop='articleBody'>", '</div>', html) or ''
        paras = []
        for p in clean_paragraphs('%s<br/>%s' % (title, body)):
            if 'MicrosoftInternetExplorer4' in p:
                break
            paras.append(p)
        pubdate = re.search(
            r'<span\s+property="dc:date\s+dc:created"\s+content="(20[^"]+)"',
            html)
        pubdate = pubdate.group(1) if pubdate else None
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

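# cleantext() and clean_paragraphs() are likewise defined elsewhere; these
# are rough sketches under the assumption that cleantext() strips markup and
# collapses whitespace, while clean_paragraphs() splits HTML on block-level
# tags and returns the non-empty cleaned paragraphs.
def cleantext(html):
    text = re.sub(r'<[^>]+>', ' ', html)  # drop tags; entities not handled
    return ' '.join(text.split())


def clean_paragraphs(html):
    chunks = re.split(r'</?(?:p|br|div|h[1-6])[^>]*>', html)
    return [p for p in (cleantext(c) for c in chunks) if p]
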
def _crawl_observador_pt(crawler, out):
    urls = set()
    for author_page in sorted(
            re.findall(
                r'href="(https?://observador.pt/perfil/[a-zA-Z_\-0-9]+/)"',
                crawler.fetch_content('http://observador.pt/autores/'))):
        html = crawler.fetch_content(author_page)
        urls.update(
            re.findall(
                r'href="(https?://observador.pt/20\d{2}/\d{2}/\d{2}/[^"]+)"',
                html))
    for url in sorted(urls):
        try:
            html = crawler.fetch_content(url)
        except UnicodeDecodeError:
            continue
        title = re.search(r'<meta property="og:title" content="([^"]+)"',
                          html)
        title = title.group(1) if title else ''
        pubdate = re.search(r'"dateModified":"([^"]+)"', html)
        pubdate = pubdate.group(1) if pubdate else None
        lead = extract('<div class="lead">', '</div>', html) or ''
        content = extract('<div class="content">', '<h1>', html) or ''
        text = '\n'.join(clean_paragraphs('<p>'.join([title, lead, content])))
        text = text.split('\nContinuar a ler')[0]
        text = text.split('\nLer mais')[0]
        text = text.split('\nPartilhe')[0]
        text = text.split('\nComente')[0]
        if text:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write(text)
            out.write('\n')

def crawl_blogspot(crawler, out, host):
    sitemap = crawler.fetch_sitemap('https://%s/sitemap.xml' % host)
    pubdate_regex = re.compile(
        r"<abbr class='published' title='([^']*)'>[^<]*</abbr>")
    for url in sorted(sitemap.keys()):
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        pubdate_match = pubdate_regex.search(html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        if pubdate is None:
            pubdate = fetchresult.headers.get('Last-Modified')
        if pubdate is None:
            pubdate = sitemap[url]
        title = re.search(r"<meta content='([^']+)' property='og:title'/>",
                          html)
        title = title.group(1) if title else ''
        post = extract("<div class='post-body entry-content'>",
                       "<div class='post-footer'>", html) or ''
        paras = clean_paragraphs(title + '<br/>' + post)
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: Blog\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

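# The `crawler` object these functions receive is constructed elsewhere.
# Below is a hypothetical minimal interface inferred from the call sites in
# this file; the names and return shapes are assumptions, not the real class.
class Crawler(object):
    def fetch(self, url):
        # Returns a result with .status (int), .content (bytes), and
        # .headers (a dict-like supporting .get()).
        raise NotImplementedError

    def fetch_content(self, url):
        # Returns the decoded text of a successfully fetched page.
        raise NotImplementedError

    def fetch_sitemap(self, url, subsitemap_filter=None):
        # Returns a dict mapping page URL to its sitemap lastmod date,
        # optionally filtering which sub-sitemaps of an index are followed.
        raise NotImplementedError

    def set_context(self, ssl_context):
        # Replaces the SSL context used for subsequent fetches
        # (used by crawl_peig_ie below).
        raise NotImplementedError
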
def crawl_tuairisc_ie(crawler, out):
    sitemap = crawler.fetch_sitemap('https://tuairisc.ie/sitemap.xml')
    pubdate_regex = re.compile(
        r'<time datetime="(20\d\d-\d\d-\d\d)\s+(\d\d:\d\d)" '
        r'itemprop="datePublished">')
    for url in sorted(sitemap.keys()):
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        title = extract('<h1 class="title article--full__title">', '</h1>',
                        html) or ''
        pubdate_match = pubdate_regex.search(html)
        if pubdate_match:
            pubdate = '%sT%s:00Z' % (pubdate_match.group(1),
                                     pubdate_match.group(2))
        else:
            pubdate = sitemap[url]
        body = extract(
            '<div class="article--full__content" itemprop="articleBody">',
            '</article>', html)
        if not body:
            continue
        paras = clean_paragraphs(title + '<p/>' + body)
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

def crawl_ainm_ie(crawler, out):
    links = set()
    for let in map(chr, range(65, 91)):  # 'A' through 'Z'
        idxres = crawler.fetch('https://www.ainm.ie/Abc.aspx?Letter=%s' % let)
        if idxres.status != 200:
            continue
        idxhtml = idxres.content.decode('utf-8')
        index = extract('<div id="pageContent" role="main">',
                        '<!-- .contentWrapper-->', idxhtml) or ''
        for link in re.findall(r'<a href="(Bio.aspx\?ID=[^"]+?)">', index):
            links.add('https://www.ainm.ie/%s' % link)
    for url in sorted(links):
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        title = re.search(r'<title>(.+?)</title>', html)
        title = title.group(1).split('|')[0] if title else ''
        body = extract('<div class="article">', '<!-- .contentWrapper-->',
                       html) or ''
        body = body.split('<div id="machines"')[0]
        paras = clean_paragraphs(title + '<br/>' + body)
        pubdate = fetchresult.headers.get('Last-Modified')
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: Biography\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

def _crawl_iltirreno_gelocal_it(crawler, out):
    urls = set()
    for category in ('italia-mondo', 'focus/toscana-economia',
                     'empoli/cronaca', 'grosseto/cronaca', 'livorno/cronaca',
                     'livorno/dagli-enti', 'lucca/cronaca', 'pisa/cronaca',
                     'prato/cronaca', 'versilia/cronaca'):
        urls.update(_find_tirreno_urls(crawler, category))
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        content = doc.content.decode('utf-8')
        header = extract('<h1 itemprop="headline name">',
                         '<span itemprop="author"', content) or ''
        body = extract('<span itemprop="articleBody" >', '©', content) or ''
        paras = clean_paragraphs('%s<p/>%s' % (header, body))
        text = '\n'.join(paras)
        for sep in ('Tags\n', 'Redazione | Scriveteci', 'TrovaRistorante',
                    '<a href="', 'I COMMENTI DEI LETTORI', '©RIPRODUZIONE'):
            text = text.split(sep)[0]
        paras = text.splitlines()
        pubdate = re.search(
            r'<time itemprop="datePublished" content="([^"]+)"', content)
        pubdate = pubdate.group(1) if pubdate else None
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

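# _find_tirreno_urls() is not part of this excerpt. A hypothetical sketch,
# assuming each category page links its articles with absolute URLs under
# the same category path (the exact URL pattern is an assumption):
def _find_tirreno_urls(crawler, category):
    index = crawler.fetch_content('http://iltirreno.gelocal.it/%s' % category)
    return set(re.findall(
        r'href="(https?://iltirreno\.gelocal\.it/%s/[^"]+)"' % category,
        index))
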
def crawl_meoneile_ie(crawler, out):
    sitemap = crawler.fetch_sitemap('https://meoneile.ie/sitemap.xml')
    for url in sorted(sitemap.keys()):
        if url == 'https://meoneile.ie/':
            continue
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        title = extract('<title>', '</title>', html) or ''
        title = title.split('<')[0].strip()
        video = re.search(
            r"<iframe.*src='(//player.vimeo.com/video/[0-9]+)[^>]*></iframe>",
            html)
        body = extract("<div class='article-content'>", '</article>',
                       html) or ''
        byline = extract("<div class='byline'>", '</span>', html) or ''
        byline = _byline_to_pubdate(byline)
        if body.find('<strong>%s</strong>' % title) >= 0:
            title = ''  # the body already repeats the title
        paras = clean_paragraphs(title + '<br/>' + body)
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if video:
                out.write('# Video: https:%s\n' % video.group(1))
            if byline:
                out.write('# Publication-Date: %s\n' % byline)
            for para in paras:
                if para == 'Roinn':  # share-widget label
                    continue
                out.write(para + '\n')

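# _byline_to_pubdate() is defined elsewhere. A hypothetical sketch, assuming
# the meoneile.ie byline contains a dd/mm/yyyy date that should be emitted
# as an ISO YYYY-MM-DD string:
def _byline_to_pubdate(byline):
    match = re.search(r'(\d{1,2})/(\d{1,2})/(20\d{2})', byline)
    if not match:
        return None
    return '%s-%02d-%02d' % (match.group(3), int(match.group(2)),
                             int(match.group(1)))
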
def _crawl_telegraaf_nl(crawler, out):
    sitemap = crawler.fetch_sitemap(
        'http://www.telegraaf.nl/sitemap.xml',
        subsitemap_filter=_should_fetch_telegraaf_sitemap)
    for url in sorted(sitemap):
        doc = crawler.fetch(urlencode(url))
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        title = re.search(
            r'<meta [a-zA-Z\-="]* property="og:title" content="(.+?)"', html)
        title = title.group(1) if title else ''
        pubdate = re.search(r'"publishDate":"([^"]+)"', html)
        pubdate = pubdate.group(1) if pubdate else None
        text = extract(
            'data-element="ArticlePage-intro">',
            '<div class="flex" data-element="ArticlePage-socialShare-root">',
            html) or ''
        paras = clean_paragraphs(title + '<br/>' + text)
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

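# _should_fetch_telegraaf_sitemap() is not shown in this excerpt. A
# hypothetical placeholder, assuming the sitemap index also lists video and
# other non-article sub-sitemaps that are not worth fetching:
def _should_fetch_telegraaf_sitemap(url):
    return 'video' not in url
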
def _crawl_vatantat_ru(crawler, out):
    index = crawler.fetch_content('http://www.vatantat.ru/')
    last = max([int(p) for p in re.findall(r'index\.php\?pg=(\d+?)"', index)])
    for page in range(2, last + 1):
        url = 'http://www.vatantat.ru/index.php?pg=%d' % page
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        content = doc.content.decode('utf-8')
        html = extract('<p><span style="font-size: large;"><strong>',
                       '<span style="font-size: 80%; font-weight: bold;">',
                       content)
        if not html:
            continue
        html = html.split('(“Ватаным Татарстан”,')[0]
        html = html.split('<script>')[0]
        paras = clean_paragraphs(html)
        if not paras:
            continue
        pubdate = re.search(
            r'Татарстан”, /№ (none|\d+), '
            r'(\d\d)\.(\d\d)\.(20\d\d)/', content)
        if pubdate is not None:
            pubdate = ('%s-%s-%s' %
                       (pubdate.group(4), pubdate.group(3), pubdate.group(2)))
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')

def _crawl_kyym_ru(crawler, out):
    index = crawler.fetch_content('http://www.kyym.ru/')
    last = max([
        int(s) for s in re.findall(r'href="/index\.php\?start=(\d+?)"', index)
    ])
    urls = set()
    for page in range(1, last + 1):
        doc = crawler.fetch_content('http://www.kyym.ru/index.php?start=%d' %
                                    page)
        for path in re.findall(
                r'<a href="(/index\.php\?view=article&amp;[^"]+?)"', doc):
            urls.add('http://www.kyym.ru' + path.replace('&amp;', '&'))
    for url in sorted(urls):
        doc = crawler.fetch_content(url)
        html = extract('<div class="news_item_article">',
                       '<!--end news item -->', doc)
        if not html:
            continue
        paras = clean_paragraphs(html)
        if not paras:
            continue
        pubdate = re.search(
            r'<span class="createdate"><!-- date and by -->'
            r'\s*(\d{1,2}).(\d{2}).(20\d{2})', doc, flags=re.DOTALL)
        if pubdate is not None:
            pubdate = '%s-%s-%s' % (pubdate.group(3), pubdate.group(2),
                                    pubdate.group(1))
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')

def crawl_peig_ie(crawler, out):
    crawler.set_context(ssl.SSLContext(ssl.PROTOCOL_TLSv1_2))
    sitemap = crawler.fetch_sitemap('https://peig.ie/sitemap_index.xml',
                                    subsitemap_filter=_peig_filter_robots)

    def peig_cat(page):
        if page.find('/imeachtai/') >= 0:
            return 'Events'
        elif page.find('peig.ie/20') >= 0:
            return 'News'
        elif page.find('/fol%C3%BAntais/') >= 0:
            return 'Job listings'
        else:
            return ''

    # Peig.ie has a lot of posts from other sites.
    def skip_page(site):
        if site.find('//nos.ie/') >= 0:
            return True
        elif site.find('//tuairisc.ie/') >= 0:
            return True
        elif site.find('//meoneile.ie/') >= 0:
            return True
        else:
            return False

    for url in sorted(sitemap.keys()):
        if url == 'https://peig.ie/':
            continue
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        title = re.search(r'<title>(.+?)</title>', html)
        title = title.group(1).split('|')[0].strip() if title else ''
        read_more = re.search(r'<a.*href="([^"]+)"[^>]*>Níos mó</a>', html)
        if read_more and skip_page(read_more.group(1)):
            continue
        if '<meta property="article:modified_time"' in html:
            date = re.search(
                r'<meta property="article:modified_time" content="([^"]+)"',
                html).group(1)
        else:
            date = re.search(r'"datePublished":"([^"]+)"', html)
            date = date.group(1) if date else None
        body = extract('<div class="uk-margin-medium-top" property="text">',
                       '<ul class="uk-pagination', html) or ''
        paras = clean_paragraphs(title + '<br/>' + body)
        genre = peig_cat(url)
        if paras:
            out.write('# Location: %s\n' % url)
            if genre:
                out.write('# Genre: %s\n' % genre)
            if date:
                out.write('# Publication-Date: %s\n' % date)
            out.write('\n'.join(paras) + '\n')
    crawler.set_context(ssl.SSLContext(ssl.PROTOCOL_TLSv1))

def _crawl_asymptotejournal_com(crawler, out):
    url = ('https://www.asymptotejournal.com/nonfiction/'
           'shibu-tudu-memories-of-the-kirta-dangra/santhali/')
    html = crawler.fetch_content(url)
    content = extract('<!-- article content -->',
                      '<img src="/images/end-logo-black.gif"', html) or ''
    out.write('# Location: %s\n' % url)
    out.write('# Genre: Fiction\n')
    paras = clean_paragraphs(content)
    # Drop paragraphs starting with a Latin capital (likely English text).
    paras = [p for p in paras if p[0] not in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ']
    out.write('\n'.join(paras) + '\n')

def crawl_pl_usembassy_gov(crawler, out):
    sitemap = crawler.fetch_sitemap(
        'https://pl.usembassy.gov/sitemap_index.xml')
    trans_regex = re.compile(
        r'<h3>Tłumaczenie</h3>'
        r'<div class="translations_sidebar"><ul><li><a href ?="([^"]*)"')
    pubdate_regex = re.compile(
        r'<meta property="article:published_time" content="([^"]*)"')
    links = set()
    for key in sorted(sitemap.keys()):
        if _pl_usembassy_gov_path(key):
            links.add(key)
    for link in sorted(links):
        result = crawler.fetch(link)
        if result.status != 200:
            continue
        html = result.content.decode('utf-8')
        title = extract('<title>', '</title>', html) or ''
        title = title.split(' | ')[0]
        pubdate_match = pubdate_regex.search(html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        trans_match = trans_regex.search(html)
        trans = trans_match.group(1) if trans_match else None
        if pubdate is None:
            pubdate = result.headers.get('Last-Modified')
        if pubdate is None:
            pubdate = sitemap[link]
        exstart = '<div class="entry-content">'
        exstart2 = '<div class="mo-page-content">'
        exend = ('<!-- AddThis Advanced Settings above via filter '
                 'on the_content -->')
        exstart = exstart2 if exstart2 in html else exstart
        content = extract(exstart, exend, html)
        cleanparas = clean_paragraphs(content) if content else None
        # Don't repeat the title if it's the only text content.
        cleantitle = cleantext(title)
        if cleanparas:
            if len(cleanparas) == 1 and cleanparas[0] == cleantitle:
                paras = [cleantitle]
            else:
                paras = [cleantitle] + cleanparas
        else:
            paras = [cleantitle]
        # There are quite a few media pages whose only text is the filename;
        # this, conveniently, is typically also the post's name.
        if len(paras) == 1 and paras[0].lower() in urlpath(link).lower():
            continue
        if paras:
            out.write('# Location: %s\n' % link)
            out.write('# Genre: Diplomatic\n')
            if trans:
                out.write('# Translation: %s\n' % trans)
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

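# urlpath() and _pl_usembassy_gov_path() are defined elsewhere. Sketches
# under stated assumptions: urlpath() returns just the path component of a
# URL, and the path filter keeps ordinary content pages (exactly what it
# excludes is an assumption).
def urlpath(url):
    return urlparse(url).path


def _pl_usembassy_gov_path(url):
    return '/wp-content/' not in urlpath(url)
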
def _crawl_pravda_sk(crawler, out):
    for url in sorted(_find_urls_on_pravda_sk(crawler)):
        doc = crawler.fetch_content(url)
        title = re.search(r'<h1[^>]*>(.+?)</h1>', doc)
        title = title.group(1) if title else ''
        pubdate = re.search(
            r'<meta property="article:published_time" content="(.+?)"', doc)
        pubdate = pubdate.group(1) if pubdate else None
        text = extract('<div class="article-detail-perex">',
                       '<div class="clearfix">', doc) or ''
        paras = clean_paragraphs(title + '<br/>' + text)
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')

def _crawl_utusan_borneo_berita_iban(crawler, out):
    for url in sorted(_find_urls_utusan_borneo_berita_iban(crawler)):
        doc = crawler.fetch_content(url)
        title = re.search(r'<meta property="og:title" content="(.+?)"', doc)
        title = title.group(1) if title else ''
        body = extract('<p>', '<footer>', doc) or ''
        paras = clean_paragraphs('<h1>%s</h1>' % title + body)
        pubdate = re.search(
            r'<meta property="article:published_time" content="([\d\-]+)"',
            doc)
        pubdate = pubdate.group(1) if pubdate else None
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

def _rte_cleanall(html):
    section_article_regex = re.compile(
        r'<section[^>]+itemprop="articleBody"[^>]*>')
    search = section_article_regex.search(html)
    out = []
    if search:
        body = extract(search.group(0), '</section>', html)
        for para in clean_paragraphs(body):
            if _rte_writable_paragraph(para):
                out.append(para)
        return '\n'.join(out)
    for paragraph in re.findall(r'<p>(.+?)</p>', html):
        cleaned = cleantext(paragraph)
        if _rte_writable_paragraph(cleaned):
            out.append(cleaned)
    return '\n'.join(out)

def _crawl_jornalet_com(crawler, out):
    for url in sorted(_find_urls_jornalet_com(crawler)):
        try:
            html = crawler.fetch_content(url)
        except UnicodeDecodeError:
            continue
        title = re.search(r'<meta property="og:title" content="([^"]+)"',
                          html)
        title = title.group(1) if title else ''
        subtitle = extract('<h4 class="subtitol">', '</h4>', html) or ''
        content = extract('<p class="contingut">', '<hr', html) or ''
        paras = clean_paragraphs('\n'.join(
            ['<p>%s</p>' % p for p in (title, subtitle, content) if p]))
        paras = [p for p in paras
                 if p.find('Abonar los amics de Jornalet') < 0]
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        out.write('\n'.join(paras) + '\n')

def crawl_coislife_ie(crawler, out):
    links = set()
    for num in range(1, 12):
        if num > 1:
            listurl = ('https://www.coislife.ie/product-category/ga/page/%s/'
                       % num)
        else:
            listurl = 'https://www.coislife.ie/product-category/ga/'
        idxres = crawler.fetch(listurl)
        if idxres.status != 200:
            continue
        idxhtml = idxres.content.decode('utf-8')
        index = extract('<div class="products-archive--products">',
                        '<nav class="woocommerce-pagination">', idxhtml) or ''
        for link in re.findall(
                r'<a href="(https://www.coislife.ie/product/[^"]+?)">',
                index):
            links.add(link)
    for url in sorted(links):
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        title = re.search(r'<title>(.+?)</title>', html)
        title = title.group(1).split('–')[0].strip() if title else ''
        desc = re.search(
            r'<meta property="og:description" content="([^"]+?)"', html)
        desc = cleantext(desc.group(1)) if desc else ''
        body = extract(
            '<div class="tab-content">',
            '<div class="entry-content in fade tab-pane" '
            'id="tab-additional_information">', html) or ''
        paras = clean_paragraphs(title + '<br/>' + body)
        pubdate = fetchresult.headers.get('Last-Modified')
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: Commerce\n')
            if desc:
                out.write('# Description: %s\n' % desc)
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            for para in paras:
                if para.find('Léigh sliocht as an leabhar') >= 0:
                    continue
                out.write(para + '\n')

def _crawl_news_mn(crawler, out):
    index = crawler.fetch_content(
        'https://www.news.mn/api/v1/mongo/getNewsByLang?id=3')
    for i in sorted(set(item['newsId'] for item in json.loads(index))):
        url = 'https://www.news.mn/api/v1/news/%d/-1' % i
        doc = json.loads(crawler.fetch_content(url))
        pubDate = doc.get('publishDate', doc.get('createdAt'))
        if pubDate:  # epoch milliseconds
            pubDate = datetime.utcfromtimestamp(pubDate / 1000.0).isoformat()
        title = doc.get('title', '')
        html = '<h1>%s</h1>%s' % (title, doc.get('infoHtml', ''))
        text = '\n'.join(clean_paragraphs(html))
        # Map characters through QAGAN_CHARMAP; anything unmapped is dropped.
        text = ''.join([QAGAN_CHARMAP.get(c, '') for c in text])
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubDate:
            out.write('# Publication-Date: %sZ\n' % pubDate)
        out.write(text + '\n')

def crawl_forasnagaeilge_ie(crawler, out):
    sitemap = crawler.fetch_sitemap(
        'https://www.forasnagaeilge.ie/sitemap_index.xml')
    pubdate_regex = re.compile(r'"datePublished":"([^"]+)",')
    for url in sorted(sitemap.keys()):
        orig_url = url
        if '?lang=en' in url:
            ga_url = url.replace('?lang=en', '')
            if ga_url in sitemap:
                continue  # prefer the Irish-language version
        if '/blog-en/' in url:
            continue
        if '/corporate-information/' in url:
            continue
        if '/torthai-cuardaigh/' in url:
            continue
        fetchresult = crawler.fetch(url)
        if fetchresult.status != 200:
            continue
        html = fetchresult.content.decode('utf-8')
        if '<html class="no-js" lang="en">' in html:
            continue
        title = extract('<title>', ' - www.forasnagaeilge.ie</title>',
                        html) or ''
        pubdate_match = pubdate_regex.search(html)
        if pubdate_match:
            pubdate = pubdate_match.group(1)
        else:
            pubdate = sitemap.get(url) or sitemap[orig_url]
        body = extract('<div id="main" class="container">',
                       '</div><!-- /.content -->', html)
        if not body:
            continue
        paras = clean_paragraphs(body)
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            out.write('# Title: %s\n' % title)
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

def _crawl_disom_khobor(crawler, out):
    index = crawler.fetch(
        'http://wesanthals.tripod.com/id43.html').content.decode('ISO-8859-1')
    for url in sorted(set(re.findall(
            r'http://wesanthals.tripod.com/(?:disomk02|DK-\d+)/[^"\']+',
            index))):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        assert b'charset=ISO-8859-1' in doc.content
        html = extract('sahta 1', '<hr', doc.content.decode('ISO-8859-1'))
        if not html:
            continue
        # Take the latest of the dd/mm/yy(yy) dates found on the page.
        pubdate = max([_parse_date(d)
                       for d in re.findall(r'\d\d/\d\d/\d{2,4}', html)])
        # Normalize spacing around commas and parentheses before conversion.
        html = html.replace(' ,', ',').replace(',', ', ')
        html = html.replace('(', ' (').replace(')', ') ')
        html = html.replace(') ,', '),')
        text = '\n'.join([_to_unicode(p) for p in clean_paragraphs(html)])
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        out.write('# Publication-Date: %s\n' % pubdate)
        out.write(text + '\n')

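# _parse_date() is defined elsewhere. A hypothetical sketch: it normalizes
# the dd/mm/yy(yy) strings found on these pages into ISO YYYY-MM-DD so that
# max() above picks the most recent date lexicographically.
def _parse_date(date):
    day, month, year = date.split('/')
    if len(year) == 2:
        year = '20' + year
    return '%04d-%02d-%02d' % (int(year), int(month), int(day))
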
def _crawl_raestdzinad_ru(crawler, out):
    urls = crawler.fetch_sitemap(
        urlencode('https://растдзинад.рф/sitemap_index.xml'))
    for url in sorted(urls):
        if re.search(r'/20\d{2}/', url) is None:
            continue
        html = crawler.fetch_content(url)
        title = extract('<h1 class="entry-title">', '</h1>', html) or ''
        text = extract('<div class="td-post-content">', '<footer>',
                       html) or ''
        text = text.split('<div class = "evc-social-likes"')[0]
        pubdate = re.search(
            r'<meta property="article:published_time" content="([^"]+)"',
            html)
        if pubdate:
            pubdate = pubdate.group(1)
        paras = clean_paragraphs('%s<p/>%s' % (title, text))
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

def _crawl_eestikirik_ee(crawler, out):
    for url in sorted(_find_urls_eestikirik_ee(crawler)):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        title = extract('<h1 class="entry_title">', '</h1>', html) or ''
        entry = extract('<div class="entry">',
                        '<div style="min-height:33px;"', html) or ''
        pubdate = re.search(
            r'(\d{1,2})\.(\d{1,2})\.(20\d{2})',
            extract('<div id="content">', '</small>', html) or '')
        if pubdate is not None:
            pubdate = '%04d-%02d-%02d' % (int(pubdate.group(3)),
                                          int(pubdate.group(2)),
                                          int(pubdate.group(1)))
        paras = clean_paragraphs('%s<br/>%s' % (title, entry))
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

def _crawl_val_levante_emv_com(crawler, out):
    urls = set()
    for url in crawler.fetch_sitemap('http://val.levante-emv.com/sitemap.xml'):
        url = url.replace('//www.levante-emv.com', '//val.levante-emv.com')
        if re.search(r'/\d{4}/\d{2}/\d{2}/', url) is not None:
            urls.add(url)
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        es_url = url.replace('//val.levante-emv.com', '//www.levante-emv.com')
        html = doc.content.decode('utf-8')
        pubdate = re.search(
            r'<meta name="cXenseParse:recs:publishtime" content="([^"]+)"',
            html)
        pubdate = pubdate.group(1) if pubdate else None
        title = extract('<span itemprop="articleBody">', '</h1>', html)
        subtitle = extract('<h2 itemprop="description">', '</h2>', html)
        content = extract('<span itemprop="articleBody">',
                          '</apertium-notrans>', html)
        paras = clean_paragraphs(''.join(
            ['<p>%s</p>' % p for p in (title, subtitle, content) if p]))
        text = '\n'.join(paras)
        for sep in ['Compartir en Twitter', 'HEMEROTECA\n', '\nPublicitat\n']:
            text = text.split(sep)[0].strip()
        if not text:
            continue
        if any(b in text for b in [
                'inicia sessió si eres subscriptor',
                'Si eres subscriptor inicia sessió',
                'Para continuar leyendo... suscríbete'
        ]):
            continue  # paywalled article
        out.write('# Location: %s\n' % url)
        out.write('# Translation.es: %s\n' % es_url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write(text + '\n')

def _crawl_visao_sapo_pt(crawler, out):
    sitemap = crawler.fetch_sitemap(
        'http://visao.sapo.pt/sitemap/visao_index.xml')
    sitemap.update(
        crawler.fetch_sitemap('http://visao.sapo.pt/sitemap/visao_news.xml'))
    for url in sorted(sitemap):
        html = crawler.fetch_content(url)
        title = re.search(
            r'<meta name="twitter:title" property="og:title" '
            r'content="([^"]+)"', html)
        title = title.group(1) if title else ''
        pubdate = re.search(
            r'<p class="timeStamp publishedDate" datetime="([^"]+)"', html)
        pubdate = pubdate.group(1) if pubdate else None
        body = extract('<div class="afterHeader">', '<footer', html) or ''
        paras = clean_paragraphs('%s<p>%s' % (title, body))
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')

def _crawl_yaikrb_ru(crawler, out):
    urls = set()
    # The site has an incomplete sitemap, so we also look at the archive
    # pages.
    sitemap = crawler.fetch_sitemap('http://yaikrb.ru/sitemap.xml')
    urls.update(sitemap)
    main = crawler.fetch_content('http://yaikrb.ru/')
    archives = set([str(x) for x in range(1, 150)])
    archives.update(re.findall(r'/xf/num/([^/]+)/', main))
    for a in sorted(archives):
        doc = crawler.fetch(urlencode('http://yaikrb.ru/xf/num/%s/' % a))
        if doc.status != 200:
            continue
        for href in re.findall(r'<div class="n_more"><a href="([^"]+)"',
                               doc.content.decode('utf-8')):
            urls.add(urljoin('http://yaikrb.ru/', href,
                             allow_fragments=False))
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        title = re.search('<meta property="og:title" content="([^"]+)"', html)
        title = title.group(1) if title else ''
        text = extract('<div class="n_text">', '<div class="n_oth">',
                       html) or ''
        paras = clean_paragraphs('<h1>' + title + '</h1>' + text)
        if not paras:
            continue
        pubdate = re.search(
            r'<small>(\d{1,2})\.(\d{1,2})\.(20\d{2})\s*</small></h1>', html)
        if pubdate is not None:
            pubdate = '%s-%s-%s' % (pubdate.group(3), pubdate.group(2),
                                    pubdate.group(1))
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')

def _crawl_than_lwin_times(crawler, out):
    urls = find_wordpress_urls(crawler, 'http://thanlwintimes.com/')
    for url in sorted(urls):
        if not url.endswith('/'):
            continue
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        pubdate = re.search(
            r'<time class="entry-date updated td-module-date" '
            r'datetime="([^"]+)"', html)
        pubdate = pubdate.group(1) if pubdate else ''
        title = (extract('<title>', '</title>', html) or '').split('|')[0]
        body = extract('<div class="td-post-content">',
                       "<div class='sfsi_Sicons'", html) or ''
        body = body.split('Please follow and like us')[0]
        paragraphs = clean_paragraphs('%s<br/>%s' % (title, body))
        if len(paragraphs) > 0:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paragraphs) + '\n')

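# find_wordpress_urls() lives in the crawler's shared utilities. A minimal
# hypothetical version: WordPress sites expose their posts through
# sitemap.xml, so walking the sitemap is usually enough (the real helper
# may also crawl monthly archive pages).
def find_wordpress_urls(crawler, site):
    return set(crawler.fetch_sitemap(site + 'sitemap.xml').keys())
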
def _crawl_dnevnik_si(crawler, out):
    urls = set()
    for url in crawler.fetch_sitemap('https://www.dnevnik.si/sitemap'):
        match = re.search(r'#(\d+)$', url)
        if not match:
            match = re.search(r'dnevnik\.si/(\d+)', url)
        if match:
            urls.add('https://www.dnevnik.si/' + match.group(1))
    for url in sorted(urls):
        doc = crawler.fetch_content(url)
        title = re.search(r'<meta name="og:title" content="(.+?)"', doc)
        title = title.group(1).replace('&amp;', '&') if title else ''
        pubdate = re.search(r'<div class="dtstamp" title="(.+?)">', doc)
        pubdate = pubdate.group(1).strip() if pubdate else None
        text = extract('<div class="article-body article-wrap">',
                       '<div class="article-tags">', doc) or ''
        paras = clean_paragraphs(title + '<br/>' + text.replace('\r', '\n'))
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')

def _crawl_svaboda_org(crawler, out):
    sitemap = crawler.fetch_sitemap('https://www.svaboda.org/sitemap.xml')
    for url in sorted(sitemap):
        if (url == 'https://www.svaboda.org/' or
                url.startswith('https://www.svaboda.org/z/')):  # index pages
            continue
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        title = extract('<title>', '</title>', html) or ''
        pubdate = re.search(
            r'<div class="published">\s*<span class="date"\s*>'
            r'\s*<time datetime="([^"]+)"', html)
        pubdate = pubdate.group(1) if pubdate else None
        body = extract('<div class="body-container">', '<div id="comments"',
                       html) or ''
        paras = clean_paragraphs('%s<p/>%s' % (title, body))
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')