def crawl_gsw_seislerblog(crawler):
    # Collect article URLs from the paginated index of the Seislerblog.
    urls = set()
    for i in range(1, 16):
        indexurl = ('http://www.freiburger-nachrichten.ch/blogs/seislerblog'
                    '?page=%d' % i)
        html = crawler.fetch(indexurl).content.decode('utf-8')
        for url in re.findall(r'<a href="(/blogs/seislerblog/.+?)[\s"]', html):
            urls.add(urljoin(indexurl, url))
    # Swiss German as written in the Canton of Fribourg (gsw-u-sd-chfr).
    out = crawler.get_output('gsw-u-sd-chfr')
    for url in sorted(urls):
        out.write('# Location: %s\n' % url)
        out.write('# Genre: Blog\n')
        text = crawler.fetch(url).content.decode('utf-8')
        pubdate = re.search(
            r'<span class="date-created">([0-9]{1,2})\.([0-9]{2})\.'
            r'(20[0-9]{2})</span>', text)
        if pubdate is not None:
            # Convert DD.MM.YYYY to ISO 8601 (YYYY-MM-DD).
            day, month, year = pubdate.groups()
            pubdate = '%04d-%02d-%02d' % (int(year), int(month), int(day))
            out.write('# Publication-Date: %s\n' % pubdate)
        # Keep only the article body between the headline and the next section.
        text = text.split('<h1>', 1)[-1].split('<section')[0]
        text = text.replace('\n', ' ')
        for tag in ('</p>', '</h1>', '</div>'):
            text = text.replace(tag, '\n')
        for p in [' '.join(striptags(t).strip().split())
                  for t in text.splitlines()]:
            if p and p != 'Kommentare':
                out.write(p + '\n')
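# These crawlers rely on `striptags` from the project's shared utility module.
# As used here it only needs to drop HTML tags; the following is a minimal
# sketch of that assumption, not the project's actual helper.
import re

def striptags(html):
    # Replace each <...> tag with a space so adjacent words do not merge;
    # callers collapse the remaining whitespace themselves.
    return re.sub(r'<[^>]*>', ' ', html)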
def crawl_gsw_derbund(crawler):
    urls = set()
    # Page through the dossier's AJAX endpoint until it returns no more items.
    for i in range(1, 200):
        url = ('https://www.derbund.ch/ajax/tags.html?'
               'action=moreDossierStories&section_id=11127&page=%d'
               '&dossier_id=3069' % i)
        items = json.loads(crawler.fetch(url).content)['items']
        for path in re.findall(r'<a href="(.+?)"', ''.join(items)):
            if not path.startswith('/stichwort/autor/'):
                urls.add(urljoin('https://www.derbund.ch/', path))
        if len(items) == 0:
            break
    # Swiss German as written in the Canton of Bern (gsw-u-sd-chbe).
    out = crawler.get_output('gsw-u-sd-chbe')
    for url in sorted(urls):
        text = crawler.fetch(url).content.decode('utf-8')
        pubdate = re.search(
            r'Erstellt: ([0-9]{1,2})\.([0-9]{2})\.([0-9]{4})', text)
        if pubdate is not None:
            day, month, year = pubdate.groups()
            pubdate = '%04d-%02d-%02d' % (int(year), int(month), int(day))
        out.write('# Location: %s\n' % url)
        out.write('# Genre: Blog\n')
        if pubdate is not None:
            out.write('# Publication-Date: %s\n' % pubdate)
        # Cut the page down to the article body; the marker strings below are
        # matched literally against the page source.
        text = text.split('<div id="mainContent">')[1]
        text = text.split('<span class"idcode"')[0].split('(Der Bund)')[0]
        text = text.replace('***', ' ')
        if text.find('var badwordserch = 1;') >= 0:
            text = text.split('var badwordserch = 1;', 1)[1]
        paras = [' '.join(striptags(p).split()) for p in text.split('</p>')]
        for p in paras:
            if p:
                out.write(p + '\n')
def _find_tirreno_urls(crawler, category):
    site = 'http://iltirreno.gelocal.it/'
    urls = set()
    caturl = site + category
    catpage = crawler.fetch_content(caturl)
    num_pages = re.search(
        r'Pagina <span class="active">\d+</span> di (\d+)', catpage)
    baseurl = re.search(r'<a title="Vai a pagina 1" href="([^"]+)"', catpage)
    if num_pages is None or baseurl is None:
        return urls
    num_pages = int(num_pages.group(1))
    baseurl = urljoin(site, baseurl.group(1))
    for p in range(1, num_pages + 1):
        url = '%s?page=%d' % (baseurl, p) if p > 1 else baseurl
        content = crawler.fetch_content(url)
        for u in re.findall(r'<h1><a href="([^"]+)">', content):
            u = urljoin(site, replace_html_entities(u.strip()))
            if not u.startswith('http://old.iltirreno.gelocal.it/'):
                urls.add(u)
    return urls
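# `_find_tirreno_urls` only discovers article URLs; a separate crawl function
# is expected to union the results over several categories before fetching
# the articles. A hypothetical caller might look like the sketch below; the
# category names are illustrative placeholders, not taken from the source.
def _example_collect_tirreno_urls(crawler):
    urls = set()
    for category in ('regione', 'italia-mondo'):  # placeholder categories
        urls.update(_find_tirreno_urls(crawler, category))
    return sorted(urls)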
def _find_urls_utusan_borneo_berita_iban(crawler):
    urls = set()
    main = crawler.fetch_content('http://www.utusanborneo.com.my/iban')
    num_pages = max(map(int, re.findall(r'\?page=(\d+)', main)))
    for p in range(0, num_pages):
        index_url = 'http://www.utusanborneo.com.my/iban'
        if p > 0:
            index_url = index_url + '?page=%d' % p
        for url in re.findall(r'href="(/\d{4}/\d{2}/\d{2}/[^"]+)"',
                              crawler.fetch_content(index_url)):
            urls.add(urljoin(index_url, url))
    return urls
def crawl_newsbook_mt(crawler, out):
    urls = set()
    for section in ('internazzjonali', 'muzika', 'madwar-il-hajja',
                    'teknologijja', 'vatikan', 'sports', 'kummerc'):
        section_url = 'http://www.newsbook.com.mt/artikli/%s/' % section
        html = crawler.fetch(section_url).content.decode('utf-8')
        # The section index links to its own numbered pages; take the highest
        # page number as the total.
        links = re.findall(r'/artikli/%s/(\d+)/' % section, html)
        num_toc_pages = max([int(x) for x in links])
        for i in range(1, num_toc_pages + 1):
            toc_url = section_url
            if i > 1:
                toc_url = toc_url + '%d/' % i
            html = crawler.fetch(toc_url).content.decode('utf-8')
            for u in re.findall(r'href="(/artikli/\d{4}/.+?)"', html):
                url = urljoin(toc_url, u)
                if url.find('/test') < 0:
                    urls.add(url)
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        title = re.search(r'<meta content="([^"]+?)" name="title"', html)
        if title is not None:
            title = cleantext(title.group(1))
        pubdate = re.search(
            r'<meta content="([^"]+?)" itemprop="datePublished"', html)
        if pubdate is not None:
            pubdate = pubdate.group(1).strip().replace(' ', 'T') + 'Z'
        content = html.split('<p>', 1)[1].split('<div', 1)[0]
        content = content.replace('\n', ' ').replace('</p>', '\n')
        paras = [fixquotes(cleantext(p))
                 for p in [title] + content.splitlines()]
        paras = [p for p in paras if p]
        if not paras:
            continue
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        for p in paras:
            out.write(p + '\n')
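# `cleantext` and `fixquotes` are further helpers from the shared utility
# module. From their call sites above, `cleantext` appears to strip markup,
# unescape entities and collapse whitespace, and `fixquotes` appears to
# normalize quotation marks. Rough sketches under those assumptions (the
# project's own versions may handle more cases):
import html as htmllib  # aliased: the crawlers use `html` as a local variable
import re

def cleantext(text):
    # Drop tags, unescape entities such as &amp;, and collapse whitespace.
    text = re.sub(r'<[^>]*>', ' ', text)
    return ' '.join(htmllib.unescape(text).split())

def fixquotes(text):
    # Turn "straight" double quotes around a phrase into typographic quotes.
    return re.sub(r'"([^"]*)"', '“\\1”', text)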
def _crawl_yaikrb_ru(crawler, out):
    urls = set()
    # The site has an incomplete sitemap, so we also look at the archive pages.
    sitemap = crawler.fetch_sitemap('http://yaikrb.ru/sitemap.xml')
    urls.update(sitemap)
    main = crawler.fetch_content('http://yaikrb.ru/')
    archives = set([str(x) for x in range(1, 150)])
    archives.update(re.findall(r'/xf/num/([^/]+)/', main))
    for a in sorted(archives):
        doc = crawler.fetch(urlencode('http://yaikrb.ru/xf/num/%s/' % a))
        if doc.status != 200:
            continue
        for href in re.findall(r'<div class="n_more"><a href="([^"]+)"',
                               doc.content.decode('utf-8')):
            urls.add(urljoin('http://yaikrb.ru/', href,
                             allow_fragments=False))
    for url in sorted(urls):
        doc = crawler.fetch(url)
        if doc.status != 200:
            continue
        html = doc.content.decode('utf-8')
        title = re.search(r'<meta property="og:title" content="([^"]+)"', html)
        title = title.group(1) if title else ''
        text = extract('<div class="n_text">', '<div class="n_oth">', html)
        if not text:
            continue
        paras = clean_paragraphs('<h1>' + title + '</h1>' + text)
        if not paras:
            continue
        pubdate = re.search(
            r'<small>(\d{1,2})\.(\d{1,2})\.(20\d{2})\s*</small></h1>', html)
        if pubdate is not None:
            pubdate = '%s-%s-%s' % (pubdate.group(3), pubdate.group(2),
                                    pubdate.group(1))
        out.write('# Location: %s\n' % url)
        out.write('# Genre: News\n')
        if pubdate:
            out.write('# Publication-Date: %s\n' % pubdate)
        out.write('\n'.join(paras) + '\n')
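# `extract` and `clean_paragraphs` (also used by `_crawl_rtl_lu` below) come
# from the same utility module. Judging by the call sites, `extract(start,
# end, text)` returns the substring between two markers (or None if the start
# marker is missing), and `clean_paragraphs` turns an HTML fragment into a
# list of cleaned text paragraphs. Minimal sketches under those assumptions:
import re

def extract(start, end, text):
    # Substring between `start` and `end`; None if `start` is not found.
    startpos = text.find(start)
    if startpos < 0:
        return None
    startpos += len(start)
    endpos = text.find(end, startpos)
    return text[startpos:endpos] if endpos >= 0 else text[startpos:]

def clean_paragraphs(fragment):
    # Treat closing block-level tags (and the '<p/>' separator used in
    # _crawl_rtl_lu) as paragraph breaks, strip the remaining markup, and
    # drop empty lines.
    fragment = re.sub(r'</(?:p|h\d|div|li)>|<p/>', '\n', fragment)
    fragment = re.sub(r'<[^>]*>', ' ', fragment)
    paras = [' '.join(line.split()) for line in fragment.splitlines()]
    return [p for p in paras if p]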
def _crawl_rtl_lu(crawler, out):
    urls = set()
    homepage = crawler.fetch_content('http://www.rtl.lu/')
    cats = extract('<!-- MAIN NAVIGATION -->', '</header>', homepage)
    for cat in re.findall(r'href="(https?://www\.rtl\.lu/[^"]+?)">', cats):
        caturl = cat + 'archiv/'
        if cat.find('/sport/') > 0:
            caturl = caturl + 'all/'
        doc = crawler.fetch(caturl)
        if doc.status != 200:
            continue
        content = doc.content.decode('utf-8')
        num_pages = re.search(r'archiv\?p=(\d+)" class="last">»', content)
        num_pages = int(num_pages.group(1)) if num_pages else 0
        for p in range(1, num_pages + 1):
            page = crawler.fetch_content(caturl[:-1] + '?p=%d' % p)
            html = extract('<div class="teaser archive-header">',
                           '<div class="pager">', page)
            if not html:
                continue
            for url in re.findall(r'href="([^"]+?)"', html):
                urls.add(urlencode(urljoin('http://www.rtl.lu/', url)))
    for url in sorted(urls):
        if url in BLACKLIST:
            continue
        doc = crawler.fetch_content(url)
        header = extract('<header>', '</header>', doc) or ''
        if header:
            header = header.replace('</span>', ' ')
        pubdate = re.search(
            r'(\d{1,2})\.(\d{1,2})\.(20\d{2}), (\d\d):(\d\d):(\d\d)</li>', doc)
        if pubdate:
            pd = [int(x) for x in pubdate.groups()]
            pubdate = '%04d-%02d-%02dT%02d:%02d:%02d+02:00' % (
                pd[2], pd[1], pd[0], pd[3], pd[4], pd[5])
        if doc.find('<section class="mainbar-right omega body">') > 0:
            start_tag = '<section class="mainbar-right omega body">'
        else:
            start_tag = '<p>'
        content = extract(start_tag, '<!-- BEGIN Comments -->', doc) or ''
        content = re.sub(r'<script.+?</script>', '', content, flags=re.DOTALL)
        content = re.sub(r'<form.+?</form>', '', content, flags=re.DOTALL)
        content = content.split('<footer')[0]
        content = content.split('<div class="pager"')[0]
        paras = clean_paragraphs(header + '<p/>' + content)
        paras = [p for p in paras
                 if (p.find('Vous souhaitez faire') < 0 and
                     p != 'äre Commentaire' and
                     not p.startswith('####'))]
        text = '\n'.join(paras)
        # Filter out some articles in French or German.
        if (text.find(' est ') >= 0 or text.find(' ist ') >= 0 or
                text.find(' Ist ') >= 0 or text.find(' dit ') >= 0 or
                text.find(' veut') >= 0):
            continue
        if paras:
            out.write('# Location: %s\n' % url)
            out.write('# Genre: News\n')
            if pubdate:
                out.write('# Publication-Date: %s\n' % pubdate)
            out.write('\n'.join(paras) + '\n')
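# All of these crawlers emit the same plain-text corpus format: '#'-prefixed
# metadata headers followed by one paragraph of text per line, for example
# (URL and date invented for illustration):
#
#   # Location: http://www.rtl.lu/news/national/a/example.html
#   # Genre: News
#   # Publication-Date: 2017-05-03T14:21:07+02:00
#   <first paragraph of the article>
#   <second paragraph of the article>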