Code example #1
def _pl_usembassy_gov_path(url):
    # Accept only Polish-language content pages; skip the /pl/ landing page
    # and the category and tag listings.
    path = urlpath(url)
    if not path.startswith('/pl/'):
        return False
    if path == '/pl/':
        return False
    if path.startswith('/pl/category/') or path.startswith('/pl/tag/'):
        return False
    return True
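The predicate above decides which sitemap entries get crawled; a minimal usage sketch, assuming the same crawler object (with fetch_sitemap) as in code example #4 below, and where the helper name _pl_usembassy_gov_links is hypothetical:

def _pl_usembassy_gov_links(crawler):
    # Gather the sitemap entries that pass the path filter above,
    # mirroring the link-collection loop in code example #4.
    sitemap = crawler.fetch_sitemap('https://pl.usembassy.gov/sitemap_index.xml')
    return sorted(k for k in sitemap.keys() if _pl_usembassy_gov_path(k))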
Code example #2
File: crawl_ky.py  Project: keshan/corpuscrawler
def crawl_azattyk_org(crawler, out):
    sitemap = crawler.fetch_sitemap('https://www.azattyk.org/sitemap.xml')
    for url in sorted(sitemap.keys()):
        if not urlpath(url).startswith('/a/'):
            continue
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        pubdate = re.search(r'"dateModified":"([^"]+)"', html)
        if pubdate is not None:
            pubdate = cleantext(pubdate.group(1)).replace(' ', 'T')
        title = extract('<title>', '</title>', html)
        text = extract('content-offset">', '</div>', html)
        if not title or not text:
            continue
        paras = [title] + re.sub(r'<br\s*?/?>', '\n', text).splitlines()
        paras = filter(None, [cleantext(p) for p in paras])
        paras = [p for p in paras if not p.startswith('http')]
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for p in paras:
            out.write(p + '\n')
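For each accepted article the crawler writes a short metadata header followed by one paragraph of text per line; the output for a single document looks roughly like this (the URL, date and paragraph lines below are placeholders, not real values):

# Location: https://www.azattyk.org/a/<article-id>.html
# Genre: News
# Publication-Date: <ISO-8601 timestamp, if one was found>
<title paragraph>
<body paragraph 1>
<body paragraph 2>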
Code example #3
def crawl_azattyk_org(crawler, out):
    sitemap = crawler.fetch_sitemap('https://www.azattyk.org/sitemap.xml')
    for url in sorted(sitemap.keys()):
        if not urlpath(url).startswith('/a/'):
            continue
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        pubdate = re.search(r'"dateModified":"([^"]+)"', html)
        if pubdate is not None:
            pubdate = cleantext(pubdate.group(1)).replace(' ', 'T')
        title = extract('<title>', '</title>', html)
        text = extract('content-offset">', '<footer', html)
        if not title or not text:
            continue
        text = text.split('<span class="share')[0]
        text = text.split('<div class="region"')[0]
        text = text.replace('\n', ' ')
        paras = [title] + re.sub(r'<(?:br|p|div)\s*?/?>', '\n', text).splitlines()
        paras = filter(None, [cleantext(p.strip()) for p in paras])
        paras = [p for p in paras if not p.startswith('http')]
        if not paras:
            continue
        # Filter out English text: Kyrgyz is written in Cyrillic (code points
        # above U+00FF), so a Latin-1 first or last character marks a Latin-script page.
        if ord(paras[0][0]) <= 0xFF or ord(paras[-1][-1]) <= 0xFF:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for p in paras:
            out.write(p + '\n')
Code example #4
def crawl_pl_usembassy_gov(crawler, out):
    sitemap = crawler.fetch_sitemap('https://pl.usembassy.gov/sitemap_index.xml')
    trans_regex = re.compile(
        r'<h3>Tłumaczenie</h3><div class="translations_sidebar"><ul><li><a href ?="([^"]*)"'
    )
    pubdate_regex = re.compile(
        r'<meta property="article:published_time" content="([^"]*)"'
    )
    links = set()
    for key in sorted(sitemap.keys()):
        if _pl_usembassy_gov_path(key):
            links.add(key)
    for link in sorted(links):
        result = crawler.fetch(link)
        if result.status != 200:
            continue
        html = result.content.decode('utf-8')
        title = extract('<title>', '</title>', html)
        title = title if title else ''
        title = title.split(' | ')[0] if ' | ' in title else title
        pubdate_match = pubdate_regex.search(html)
        pubdate = pubdate_match.group(1) if pubdate_match else None
        trans_match = trans_regex.search(html)
        trans = trans_match.group(1) if trans_match else None
        if pubdate is None:
            pubdate = result.headers.get('Last-Modified')
        if pubdate is None:
            pubdate = sitemap[link]
        exstart = '<div class="entry-content">'
        exstart2 = '<div class="mo-page-content">'
        exend = '<!-- AddThis Advanced Settings above via filter on the_content -->'
        exstart = exstart2 if exstart2 in html else exstart
        content = extract(exstart, exend, html)
        cleanparas = clean_paragraphs(content) if content else None
        # Don't repeat the title if it's the only text content
        cleantitle = cleantext(title)
        if cleanparas:
            if len(cleanparas) == 1 and cleanparas[0] == cleantitle:
                paras = [cleantitle]
            else:
                paras = [cleantitle] + cleanparas
        else:
            paras = [cleantitle]
        # There are quite a few media pages whose only text is the filename;
        # conveniently, this is typically also the post's name.
        if len(paras) == 1 and paras[0].lower() in urlpath(link).lower():
            continue
        if paras:
            out.write('# Location: %s\n' % link)
            out.write('# Genre: Diplomatic\n')
            if trans:
                out.write('# Translation: %s\n' % trans)
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
Code example #5
def crawl_dimma_fo(crawler, out):
    num_pages = int(re.search(
        r'<a href="http://www.dimma.fo/(\d+)" class="to-last"',
        crawler.fetch('http://www.dimma.fo/').content.decode('utf-8')).group(1))
    urls = set()
    for i in range(1, num_pages + 1):
        doc = crawler.fetch('http://www.dimma.fo/%d' % i)
        html = doc.content.decode('utf-8')
        for u in re.findall(r'href="(http://www.dimma.fo/[^"]+?)"', html):
            path = urlpath(u)
            if re.match(r'/\d+', path) or '/' in path[1:]:
                continue
            urls.add(u)
    for url in sorted(urls):
        doc = crawler.fetch(urlencode(url))
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        content = html.split('class="content">')[1]
        pubdate = re.search(
            r'<span class="date">\s*'
            r'(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2})\s*</span>',
            content)
        if pubdate is not None:
            pubdate = '%sT%s:00+01:00' % (pubdate.group(1), pubdate.group(2))
        paragraphs = []
        title = re.search(r'<h1>(.+?)</h1>', html, flags=re.DOTALL)
        if title is not None:
            paragraphs.append(cleantext(title.group(1)))
        text = content.split('<p>', 1)[1].split('</div>')[0]
        text = text.replace('\n', ' ').replace('</p>', '\n')
        text = text.replace('<br />', '\n')
        paragraphs.extend([cleantext(p) for p in text.splitlines()])
        paragraphs = [p for p in paragraphs if p]
        if paragraphs:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            for p in paragraphs:
                out.write(p + '\n')
Code example #6
def _rtenuacht_path(url):
    rtenuacht = urlpath(url).startswith('/news/nuacht/')
    rnagnuacht = urlpath(url).startswith('/rnag/nuacht-gaeltachta')
    return rtenuacht or rnagnuacht
Code example #7
def _rtenuacht_path(url):
    rtenuacht = urlpath(url).startswith('/news/nuacht/')
    rnag = '/rnag/nuacht' in url or '/rnag/articles' in url
    return rtenuacht or rnag
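All of the examples above depend on helpers that are not shown here: urlpath, extract, cleantext, clean_paragraphs and urlencode, plus the crawler object's fetch and fetch_sitemap methods. As a rough sketch of the assumed behaviour of the three most heavily used helpers (the real implementations in the project's util module may differ in detail):

import re
from urllib.parse import urlparse

def urlpath(url):
    # Assumed: return only the path component of a URL,
    # e.g. urlpath('https://pl.usembassy.gov/pl/tag/x/') == '/pl/tag/x/'.
    return urlparse(url).path

def extract(start, end, html):
    # Assumed: return the text between the first occurrence of `start` and
    # the next occurrence of `end`, or None if either marker is missing.
    startpos = html.find(start)
    if startpos < 0:
        return None
    endpos = html.find(end, startpos + len(start))
    if endpos < 0:
        return None
    return html[startpos + len(start):endpos]

def cleantext(text):
    # Assumed: drop HTML tags and collapse runs of whitespace.
    return ' '.join(re.sub(r'<[^>]*>', ' ', text).split())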